//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to an X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex;

    unsigned Scale;
    SDValue IndexReg;
    int32_t Disp;
    SDValue Segment;
    const GlobalValue *GV;
    const Constant *CP;
    const BlockAddress *BlockAddr;
    const char *ES;
    MCSymbol *MCSym;
    int JT;
    unsigned Align;             // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*

    X86ISelAddressMode()
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Align << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for code size instead of
    /// performance.
    bool OptForSize;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(tm, OptLevel), OptForSize(false),
          OptForMinSize(false) {}

    StringRef getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }

    void EmitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
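    // Main instruction-selection entry point, followed by the addressing-mode
    // matching and operand-selection helpers it relies on.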
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectScalarSSELoad(SDNode *Root, SDValue N,
                             SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &NodeWithChain);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    // Try to fold a vector load. This makes sure the load isn't non-temporal.
    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
                        SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp,
                        SDValue &Segment);

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
                 ? CurDAG->getTargetFrameIndex(
                       AM.Base_FrameIndex,
                       TLI->getPointerTy(CurDAG->getDataLayout()))
                 : AM.Base_Reg;
      Scale = getI8Imm(AM.Scale, DL);
      Index = AM.IndexReg;
      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i32);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!OptForSize)
        return false;

      // Walk all the users of the immediate.
      for (SDNode::use_iterator UI = N->use_begin(),
           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

        SDNode *User = *UI;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above).
        // Those instructions won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

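    /// Rescale the constant subvector index of an EXTRACT_SUBVECTOR (given in
    /// elements of the source vector) into the VecWidth-bit chunk index used
    /// as the VEXTRACT* immediate.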
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

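    /// Rescale the constant insertion index of an INSERT_SUBVECTOR (given in
    /// elements of the destination vector) into the VecWidth-bit chunk index
    /// used as the VINSERT* immediate.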
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    /// Returns whether this is a relocatable immediate in the range
    /// [-2^Width .. 2^Width-1].
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
        return isInt<Width>(CN->getSExtValue());
      return isSExtAbsoluteSymbolRef(Width, N);
    }

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlignment() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    bool matchBEXTRFromAnd(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;

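    // Helpers for emitting the SSE4.2 string-compare nodes. ROpc and MOpc are
    // the register and memory instruction forms; the memory form is used when
    // MayFoldLoad allows the operand load to be folded.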
    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);
  };
}

// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMU ||
      Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_RND)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

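// Decide whether folding the operand N (typically a load) into its user U is
// profitable when U is the root of the match. Folding is rejected when using
// a small immediate or a TLS address directly would give smaller code.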
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::SUB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed
        // to make sure immediates created by shrinkAndImmediate are always
        // folded. Ideally we would narrow the load during DAG combine and get
        // the best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl    %gs:0, %eax
      // leal    i@NTPOFF(%eax), %eax
      // instead of
      // movl    $i@NTPOFF, %eax
      // addl    %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }
    }
    }
  }

  return true;
}

/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      LD->isVolatile() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

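// Pre-selection cleanup of the DAG: turn flag-less X86ISD::AND nodes back
// into ISD::AND, move call-address loads below the call chain so they can be
// folded into CALL/TC_RETURN, and rewrite FP_ROUND/FP_EXTEND nodes that go
// through the x87 FP stack.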
void X86DAGToDAGISel::PreprocessISelDAG() {
  // OptFor[Min]Size are used in pattern predicates that isel is matching.
  OptForSize = MF->getFunction().optForSize();
  OptForMinSize = MF->getFunction().optForMinSize();
  assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");

  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useRetpoline() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store
    // and load to the stack. This is a gross hack. We would like to simply
    // mark these as being illegal, but when we do that, legalize produces
    // these when it expands calls, then expands these in the same legalize
    // pass. We would like dag combine to be able to hack on these between the
    // call expansion and the node legalization. As such this pass basically
    // does "really late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
      continue;

    MVT SrcVT = N->getOperand(0).getSimpleValueType();
    MVT DstVT = N->getSimpleValueType(0);

    // If any of the sources are vectors, no fp stack involved.
    if (SrcVT.isVector() || DstVT.isVector())
      continue;

    // If the source and destination are SSE registers, then this is a legal
    // conversion that should not be lowered.
    const X86TargetLowering *X86Lowering =
        static_cast<const X86TargetLowering *>(TLI);
    bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
    bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
|
|
|
|
continue;
|
|
|
|
|
2008-03-09 15:05:32 +08:00
|
|
|
if (!SrcIsSSE && !DstIsSSE) {
|
|
|
|
// If this is an FPStack extension, it is a noop.
|
|
|
|
if (N->getOpcode() == ISD::FP_EXTEND)
|
|
|
|
continue;
|
|
|
|
// If this is a value-preserving FPStack truncation, it is a noop.
|
|
|
|
if (N->getConstantOperandVal(1))
|
|
|
|
continue;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-01-24 16:07:48 +08:00
|
|
|
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
|
|
|
|
// FPStack has extload and truncstore. SSE can fold direct loads into other
|
|
|
|
// operations. Based on this, decide what we want to do.
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT MemVT;
|
2008-01-24 16:07:48 +08:00
|
|
|
if (N->getOpcode() == ISD::FP_ROUND)
|
|
|
|
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
|
|
|
|
else
|
|
|
|
MemVT = SrcIsSSE ? SrcVT : DstVT;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-08-23 10:25:05 +08:00
|
|
|
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-01-24 16:07:48 +08:00
|
|
|
// FIXME: optimize the case where the src/dest is a load or store?
|
[SelectionDAG] Get rid of bool parameters in SelectionDAG::getLoad, getStore, and friends.
Summary:
Instead, we take a single flags arg (a bitset).
Also add a default 0 alignment, and change the order of arguments so the
alignment comes before the flags.
This greatly simplifies many callsites, and fixes a bug in
AMDGPUISelLowering, wherein the order of the args to getLoad was
inverted. It also greatly simplifies the process of adding another flag
to getLoad.
Reviewers: chandlerc, tstellarAMD
Subscribers: jholewinski, arsenm, jyknight, dsanders, nemanjai, llvm-commits
Differential Revision: http://reviews.llvm.org/D22249
llvm-svn: 275592
2016-07-16 02:27:10 +08:00
|
|
|
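// Perform the conversion through memory: store the source value to a stack
// slot as MemVT, then load it back as DstVT, letting the store and/or load
// perform any needed rounding or extension.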
SDValue Store =
|
|
|
|
CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
|
|
|
|
MemTmp, MachinePointerInfo(), MemVT);
|
2011-02-17 00:23:55 +08:00
|
|
|
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
|
2016-07-16 02:27:10 +08:00
|
|
|
MachinePointerInfo(), MemVT);
|
2008-01-24 16:07:48 +08:00
|
|
|
|
|
|
|
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
|
|
|
|
// extload we created. This will cause general havoc on the dag because
|
|
|
|
// anything below the conversion could be folded into other existing nodes.
|
|
|
|
// To avoid invalidating 'I', back it up to the convert node.
|
|
|
|
--I;
|
2008-08-23 10:25:05 +08:00
|
|
|
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-01-24 16:07:48 +08:00
|
|
|
// Now that we did that, the node is dead. Increment the iterator to the
|
|
|
|
// next node to process, then delete N.
|
|
|
|
++I;
|
2008-08-23 10:25:05 +08:00
|
|
|
CurDAG->DeleteNode(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
}
|
2008-01-24 16:07:48 +08:00
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2018-03-17 01:13:42 +08:00
|
|
|
void X86DAGToDAGISel::PostprocessISelDAG() {
|
|
|
|
// Skip peepholes at -O0.
|
|
|
|
if (TM.getOptLevel() == CodeGenOpt::None)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// Attempt to remove vector moves that were inserted to zero upper bits.
|
|
|
|
|
|
|
|
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
|
|
|
|
++Position;
|
|
|
|
|
|
|
|
while (Position != CurDAG->allnodes_begin()) {
|
|
|
|
SDNode *N = &*--Position;
|
|
|
|
// Skip dead nodes and any non-machine opcodes.
|
|
|
|
if (N->use_empty() || !N->isMachineOpcode())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
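// We are looking for a SUBREG_TO_REG that inserts a 128-bit or 256-bit
// value into the low subregister of a wider vector register.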
unsigned SubRegIdx = N->getConstantOperandVal(2);
|
|
|
|
if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SDValue Move = N->getOperand(1);
|
|
|
|
if (!Move.isMachineOpcode())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Make sure it's one of the move opcodes we recognize.
|
|
|
|
switch (Move.getMachineOpcode()) {
|
|
|
|
default:
|
|
|
|
continue;
|
|
|
|
case X86::VMOVAPDrr: case X86::VMOVUPDrr:
|
|
|
|
case X86::VMOVAPSrr: case X86::VMOVUPSrr:
|
|
|
|
case X86::VMOVDQArr: case X86::VMOVDQUrr:
|
|
|
|
case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
|
|
|
|
case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
|
|
|
|
case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
|
|
|
|
case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
|
|
|
|
case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
|
|
|
|
case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
|
|
|
|
case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
|
|
|
|
case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
|
|
|
|
case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
|
|
|
|
case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
|
|
|
|
case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue In = Move.getOperand(0);
|
|
|
|
if (!In.isMachineOpcode() ||
|
|
|
|
In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// The producing instruction is another vector instruction. We can drop the
|
|
|
|
// move.
|
|
|
|
CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
|
|
|
|
|
|
|
|
// If the move is now dead, delete it.
|
|
|
|
if (Move.getNode()->use_empty())
|
|
|
|
CurDAG->RemoveDeadNode(Move.getNode());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Emit any code that needs to be executed only in the main function.
|
2015-10-14 00:23:00 +08:00
|
|
|
void X86DAGToDAGISel::emitSpecialCodeForMain() {
|
2011-01-06 08:47:10 +08:00
|
|
|
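// On Cygwin/MinGW targets the runtime expects main to call __main (which
// runs the global constructors), so emit that call on entry to main.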
if (Subtarget->isTargetCygMing()) {
|
2015-02-21 13:49:45 +08:00
|
|
|
TargetLowering::ArgListTy Args;
|
2015-07-09 10:09:04 +08:00
|
|
|
auto &DL = CurDAG->getDataLayout();
|
2015-02-21 13:49:45 +08:00
|
|
|
|
|
|
|
TargetLowering::CallLoweringInfo CLI(*CurDAG);
|
|
|
|
CLI.setChain(CurDAG->getRoot())
|
|
|
|
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
|
2015-07-09 10:09:04 +08:00
|
|
|
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
|
2016-06-22 20:54:25 +08:00
|
|
|
std::move(Args));
|
2015-02-21 13:49:45 +08:00
|
|
|
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
|
|
|
|
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
|
|
|
|
CurDAG->setRoot(Result.second);
|
2011-01-06 08:47:10 +08:00
|
|
|
}
|
2007-09-26 05:52:30 +08:00
|
|
|
}
|
|
|
|
|
2010-04-15 04:17:22 +08:00
|
|
|
void X86DAGToDAGISel::EmitFunctionEntryCode() {
|
2007-09-26 05:52:30 +08:00
|
|
|
// If this is main, emit special code for main.
|
2017-12-16 06:22:58 +08:00
|
|
|
const Function &F = MF->getFunction();
|
|
|
|
if (F.hasExternalLinkage() && F.getName() == "main")
|
|
|
|
emitSpecialCodeForMain();
|
2007-09-26 05:52:30 +08:00
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool isDispSafeForFrameIndex(int64_t Val) {
|
2011-07-14 05:29:53 +08:00
|
|
|
// On 64-bit platforms, we can run into an issue where a frame index
|
|
|
|
// includes a displacement that, when added to the explicit displacement,
|
|
|
|
// will overflow the displacement field. Assuming that the frame index
|
|
|
|
// displacement fits into a 31-bit integer (which is only slightly more
|
|
|
|
// aggressive than the current fundamental assumption that it fits into
|
|
|
|
// a 32-bit integer), a 31-bit disp should always be safe.
|
|
|
|
return isInt<31>(Val);
|
|
|
|
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
|
2011-07-14 04:44:23 +08:00
|
|
|
X86ISelAddressMode &AM) {
|
2018-05-22 05:03:19 +08:00
|
|
|
// If there's no offset to fold, we don't need to do any work.
|
|
|
|
if (Offset == 0)
|
|
|
|
return false;
|
|
|
|
|
2015-05-05 07:22:36 +08:00
|
|
|
// Cannot combine ExternalSymbol displacements with integer offsets.
|
2018-05-22 05:03:19 +08:00
|
|
|
if (AM.ES || AM.MCSym)
|
2015-05-05 07:22:36 +08:00
|
|
|
return true;
|
2018-05-22 05:03:19 +08:00
|
|
|
|
2011-07-14 04:44:23 +08:00
|
|
|
int64_t Val = AM.Disp + Offset;
|
|
|
|
CodeModel::Model M = TM.getCodeModel();
|
2011-07-14 05:29:53 +08:00
|
|
|
if (Subtarget->is64Bit()) {
|
|
|
|
if (!X86::isOffsetSuitableForCodeModel(Val, M,
|
|
|
|
AM.hasSymbolicDisplacement()))
|
|
|
|
return true;
|
|
|
|
// In addition to the checks required for a register base, check that
|
|
|
|
// we do not try to use an unsafe Disp with a frame index.
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
|
|
|
|
!isDispSafeForFrameIndex(Val))
|
|
|
|
return true;
|
2011-07-14 04:44:23 +08:00
|
|
|
}
|
2011-07-14 05:29:53 +08:00
|
|
|
AM.Disp = Val;
|
|
|
|
return false;
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2011-07-14 04:44:23 +08:00
|
|
|
}
|
2009-04-09 05:14:34 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
|
2010-09-22 12:39:11 +08:00
|
|
|
SDValue Address = N->getOperand(1);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-09-22 12:39:11 +08:00
|
|
|
// load gs:0 -> GS segment register.
|
|
|
|
// load fs:0 -> FS segment register.
|
|
|
|
//
|
2009-04-09 05:14:34 +08:00
|
|
|
// This optimization is valid because the GNU TLS model defines that
|
|
|
|
// gs:0 (or fs:0 on X86-64) contains its own address.
|
|
|
|
// For more information see http://people.redhat.com/drepper/tls.pdf
|
2010-09-22 12:39:11 +08:00
|
|
|
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
|
2014-04-25 13:30:21 +08:00
|
|
|
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
|
2017-02-24 11:10:10 +08:00
|
|
|
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
|
|
|
|
Subtarget->isTargetFuchsia()))
|
2010-09-22 12:39:11 +08:00
|
|
|
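// Address space 256 corresponds to the GS segment and 257 to the FS segment.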
switch (N->getPointerInfo().getAddrSpace()) {
|
|
|
|
case 256:
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
|
|
|
|
return false;
|
|
|
|
case 257:
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
|
|
|
|
return false;
|
2016-05-04 04:16:08 +08:00
|
|
|
// Address space 258 is not handled here, because it is not used to
|
|
|
|
// address TLS areas.
|
2010-09-22 12:39:11 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2009-04-09 05:14:34 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
|
|
|
|
/// mode. These wrap things that will resolve down into a symbol reference.
|
|
|
|
/// If no match is possible, this returns true, otherwise it returns false.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
// If the addressing mode already has a symbol as the displacement, we can
|
|
|
|
// never match another symbol.
|
2009-04-13 05:55:03 +08:00
|
|
|
if (AM.hasSymbolicDisplacement())
|
|
|
|
return true;
|
|
|
|
|
2018-05-22 05:03:19 +08:00
|
|
|
bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
|
|
|
|
|
|
|
|
// Only do this address mode folding for 64-bit if we're in the small code
|
|
|
|
// model.
|
|
|
|
// FIXME: But we can do GOTPCREL addressing in the medium code model.
|
2009-08-06 07:01:26 +08:00
|
|
|
CodeModel::Model M = TM.getCodeModel();
|
2018-05-22 05:03:19 +08:00
|
|
|
if (Subtarget->is64Bit() && M != CodeModel::Small && M != CodeModel::Kernel)
|
|
|
|
return true;
|
2009-08-06 07:01:26 +08:00
|
|
|
|
2018-05-22 05:03:19 +08:00
|
|
|
// Base and index reg must be 0 in order to use %rip as base.
|
|
|
|
if (IsRIPRel && AM.hasBaseOrIndexReg())
|
|
|
|
return true;
|
2009-08-06 07:01:26 +08:00
|
|
|
|
2018-05-22 05:03:19 +08:00
|
|
|
// Make a local copy in case we can't do this fold.
|
|
|
|
X86ISelAddressMode Backup = AM;
|
2009-06-27 12:16:01 +08:00
|
|
|
|
2018-05-22 05:03:19 +08:00
|
|
|
int64_t Offset = 0;
|
|
|
|
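// Record the wrapped symbolic operand (global, constant pool entry, external
// symbol, MC symbol, jump table or block address) in the addressing mode,
// along with any constant offset it carries.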
SDValue N0 = N.getOperand(0);
|
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
|
|
|
|
AM.GV = G->getGlobal();
|
|
|
|
AM.SymbolFlags = G->getTargetFlags();
|
|
|
|
Offset = G->getOffset();
|
|
|
|
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
|
|
|
|
AM.CP = CP->getConstVal();
|
|
|
|
AM.Align = CP->getAlignment();
|
|
|
|
AM.SymbolFlags = CP->getTargetFlags();
|
|
|
|
Offset = CP->getOffset();
|
|
|
|
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
|
|
|
|
AM.ES = S->getSymbol();
|
|
|
|
AM.SymbolFlags = S->getTargetFlags();
|
|
|
|
} else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
|
|
|
|
AM.MCSym = S->getMCSymbol();
|
|
|
|
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
|
|
|
|
AM.JT = J->getIndex();
|
|
|
|
AM.SymbolFlags = J->getTargetFlags();
|
|
|
|
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
|
|
|
|
AM.BlockAddr = BA->getBlockAddress();
|
|
|
|
AM.SymbolFlags = BA->getTargetFlags();
|
|
|
|
Offset = BA->getOffset();
|
|
|
|
} else
|
|
|
|
llvm_unreachable("Unhandled symbol reference node.");
|
|
|
|
|
|
|
|
if (foldOffsetIntoAddress(Offset, AM)) {
|
|
|
|
AM = Backup;
|
|
|
|
return true;
|
2009-04-13 05:55:03 +08:00
|
|
|
}
|
|
|
|
|
2018-05-22 05:03:19 +08:00
|
|
|
if (IsRIPRel)
|
|
|
|
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
|
|
|
|
|
|
|
|
// Commit the changes now that we know this fold is safe.
|
|
|
|
return false;
|
2009-04-13 05:55:03 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Add the specified node to the specified addressing mode, returning true if
|
|
|
|
/// it cannot be done. This just pattern matches for the addressing mode.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
|
|
|
|
if (matchAddressRecursively(N, AM, 0))
|
2009-07-23 07:26:55 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
|
|
|
|
// a smaller encoding and avoids a scaled-index.
|
|
|
|
if (AM.Scale == 2 &&
|
|
|
|
AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr) {
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = AM.IndexReg;
|
2009-07-23 07:26:55 +08:00
|
|
|
AM.Scale = 1;
|
|
|
|
}
|
|
|
|
|
2009-08-21 02:23:44 +08:00
|
|
|
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
|
|
|
|
// because it has a smaller encoding.
|
|
|
|
// TODO: Which other code models can use this?
|
|
|
|
if (TM.getCodeModel() == CodeModel::Small &&
|
|
|
|
Subtarget->is64Bit() &&
|
|
|
|
AM.Scale == 1 &&
|
|
|
|
AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
|
|
|
AM.IndexReg.getNode() == nullptr &&
|
2009-08-26 01:47:44 +08:00
|
|
|
AM.SymbolFlags == X86II::MO_NO_FLAG &&
|
2009-08-21 02:23:44 +08:00
|
|
|
AM.hasSymbolicDisplacement())
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
|
2009-08-21 02:23:44 +08:00
|
|
|
|
2009-07-23 07:26:55 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-22 02:56:06 +08:00
|
|
|
bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
|
|
|
|
unsigned Depth) {
|
|
|
|
// Add an artificial use to this node so that we can keep track of
|
|
|
|
// it if it gets CSE'd with a different node.
|
|
|
|
HandleSDNode Handle(N);
|
|
|
|
|
|
|
|
X86ISelAddressMode Backup = AM;
|
|
|
|
if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
|
|
|
|
!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
|
|
|
|
return false;
|
|
|
|
AM = Backup;
|
|
|
|
|
|
|
|
// Try again after commuting the operands.
|
|
|
|
if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
|
|
|
|
!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
|
|
|
|
return false;
|
|
|
|
AM = Backup;
|
|
|
|
|
|
|
|
// If we couldn't fold both operands into the address at the same time,
|
|
|
|
// see if we can just put each operand into a register and fold at least
|
|
|
|
// the add.
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase &&
|
|
|
|
!AM.Base_Reg.getNode() &&
|
|
|
|
!AM.IndexReg.getNode()) {
|
|
|
|
N = Handle.getValue();
|
|
|
|
AM.Base_Reg = N.getOperand(0);
|
|
|
|
AM.IndexReg = N.getOperand(1);
|
|
|
|
AM.Scale = 1;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
N = Handle.getValue();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2012-01-11 19:04:36 +08:00
|
|
|
// Insert a node into the DAG at least before the Pos node's position. This
|
|
|
|
// will reposition the node as needed, and will assign it a node ID that is <=
|
|
|
|
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
|
|
|
|
// IDs! The selection DAG must no longer depend on their uniqueness when this
|
|
|
|
// is used.
|
2016-04-06 04:45:04 +08:00
|
|
|
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
|
2018-03-23 03:32:07 +08:00
|
|
|
if (N->getNodeId() == -1 ||
|
|
|
|
(SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
|
|
|
|
SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
|
|
|
|
DAG.RepositionNode(Pos->getIterator(), N.getNode());
|
|
|
|
// Mark Node as invalid for pruning as after this it may be a successor to a
|
|
|
|
// selected node but otherwise be in the same position as Pos.
|
|
|
|
// Conservatively mark it with the same -abs(Id) to ensure the node id
|
|
|
|
// invariant is preserved.
|
|
|
|
N->setNodeId(Pos->getNodeId());
|
|
|
|
SelectionDAGISel::InvalidateNodeId(N.getNode());
|
2012-01-11 19:04:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-17 01:14:10 +08:00
|
|
|
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
|
|
|
|
// safe. This allows us to convert the shift and and into an h-register
|
|
|
|
// extract and a scaled index. Returns false if the simplification is
|
|
|
|
// performed.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 16:48:20 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SRL ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
|
|
|
|
!Shift.hasOneUse())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
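// The pattern only matches when the shift amount is (8 - C1) and the mask
// is exactly (0xff << C1) for a scale C1 of 1, 2 or 3.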
int ScaleLog = 8 - Shift.getConstantOperandVal(1);
|
|
|
|
if (ScaleLog <= 0 || ScaleLog >= 4 ||
|
|
|
|
Mask != (0xffu << ScaleLog))
|
|
|
|
return true;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
|
|
|
|
SDValue NewMask = DAG.getConstant(0xff, DL, VT);
|
2012-01-11 16:48:20 +08:00
|
|
|
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
|
|
|
|
SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
|
2012-01-11 16:48:20 +08:00
|
|
|
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
|
|
|
|
|
2012-01-12 09:34:44 +08:00
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, Eight);
|
|
|
|
insertDAGNode(DAG, N, Srl);
|
|
|
|
insertDAGNode(DAG, N, NewMask);
|
|
|
|
insertDAGNode(DAG, N, And);
|
|
|
|
insertDAGNode(DAG, N, ShlCount);
|
|
|
|
insertDAGNode(DAG, N, Shl);
|
2012-01-11 16:48:20 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, Shl);
|
|
|
|
AM.IndexReg = And;
|
|
|
|
AM.Scale = (1 << ScaleLog);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
|
|
|
|
// allows us to fold the shift into this addressing mode. Returns false if the
|
|
|
|
// transform succeeded.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 17:35:00 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SHL ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Not likely to be profitable if either the AND or SHIFT node has more
|
|
|
|
// than one use (unless all uses are for address computation). Besides,
|
|
|
|
// the isel mechanism requires their node ids to be reused.
|
|
|
|
if (!N.hasOneUse() || !Shift.hasOneUse())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Verify that the shift amount is something we can fold.
|
|
|
|
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
|
|
|
|
if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
|
|
|
|
return true;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
|
2012-01-11 17:35:00 +08:00
|
|
|
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
|
|
|
|
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
|
|
|
|
|
2012-01-12 09:34:44 +08:00
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewMask);
|
|
|
|
insertDAGNode(DAG, N, NewAnd);
|
|
|
|
insertDAGNode(DAG, N, NewShift);
|
2012-01-11 17:35:00 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, NewShift);
|
|
|
|
|
|
|
|
AM.Scale = 1 << ShiftAmt;
|
|
|
|
AM.IndexReg = NewAnd;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-01-11 16:41:08 +08:00
|
|
|
// Implement some heroics to detect shifts of masked values where the mask can
|
|
|
|
// be replaced by extending the shift and undoing that in the addressing mode
|
|
|
|
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
|
|
|
|
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
|
|
|
|
// the addressing mode. This results in code such as:
|
|
|
|
//
|
|
|
|
// int f(short *y, int *lookup_table) {
|
|
|
|
// ...
|
|
|
|
// return *y + lookup_table[*y >> 11];
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// Turning into:
|
|
|
|
// movzwl (%rdi), %eax
|
|
|
|
// movl %eax, %ecx
|
|
|
|
// shrl $11, %ecx
|
|
|
|
// addl (%rsi,%rcx,4), %eax
|
|
|
|
//
|
|
|
|
// Instead of:
|
|
|
|
// movzwl (%rdi), %eax
|
|
|
|
// movl %eax, %ecx
|
|
|
|
// shrl $9, %ecx
|
|
|
|
// andl $124, %rcx
|
|
|
|
// addl (%rsi,%rcx), %eax
|
|
|
|
//
|
2012-01-11 17:35:02 +08:00
|
|
|
// Note that this function assumes the mask is provided as a mask *after* the
|
|
|
|
// value is shifted. The input chain may or may not match that, but computing
|
|
|
|
// such a mask is trivial.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 17:35:02 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)))
|
2012-01-11 16:41:08 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
|
2013-05-25 06:23:49 +08:00
|
|
|
unsigned MaskLZ = countLeadingZeros(Mask);
|
|
|
|
unsigned MaskTZ = countTrailingZeros(Mask);
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// The amount of shift we're trying to fit into the addressing mode is taken
|
2012-01-11 17:35:02 +08:00
|
|
|
// from the trailing zeros of the mask.
|
|
|
|
unsigned AMShiftAmt = MaskTZ;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// There is nothing we can do here unless the mask is removing some bits.
|
|
|
|
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
|
|
|
|
if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
|
|
|
|
|
|
|
|
// We also need to ensure that mask is a continuous run of bits.
|
2015-02-12 23:35:40 +08:00
|
|
|
if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// Scale the leading zero count down based on the actual size of the value.
|
2012-01-11 17:35:02 +08:00
|
|
|
// Also scale it down based on the size of the shift.
|
2017-07-20 02:09:46 +08:00
|
|
|
unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
|
|
|
|
if (MaskLZ < ScaleDown)
|
|
|
|
return true;
|
|
|
|
MaskLZ -= ScaleDown;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// The final check is to ensure that any masked out high bits of X are
|
|
|
|
// already known to be zero. Otherwise, the mask has a semantic impact
|
|
|
|
// other than masking out a couple of low bits. Unfortunately, because of
|
|
|
|
// the mask, zero extensions will be removed from operands in some cases.
|
|
|
|
// This code works extra hard to look through extensions because we can
|
|
|
|
// replace them with zero extensions cheaply if necessary.
|
|
|
|
bool ReplacingAnyExtend = false;
|
|
|
|
if (X.getOpcode() == ISD::ANY_EXTEND) {
|
2013-08-15 13:57:07 +08:00
|
|
|
unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
|
|
|
|
X.getOperand(0).getSimpleValueType().getSizeInBits();
|
2012-01-11 16:41:08 +08:00
|
|
|
// Assume that we'll replace the any-extend with a zero-extend, and
|
|
|
|
// narrow the search to the extended value.
|
|
|
|
X = X.getOperand(0);
|
|
|
|
MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
|
|
|
|
ReplacingAnyExtend = true;
|
|
|
|
}
|
2013-08-15 13:57:07 +08:00
|
|
|
APInt MaskedHighBits =
|
|
|
|
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
|
2017-04-28 13:31:46 +08:00
|
|
|
KnownBits Known;
|
|
|
|
DAG.computeKnownBits(X, Known);
|
|
|
|
if (MaskedHighBits != Known.Zero) return true;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// We've identified a pattern that can be transformed into a single shift
|
|
|
|
// and an addressing mode. Make it so.
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2012-01-11 16:41:08 +08:00
|
|
|
if (ReplacingAnyExtend) {
|
|
|
|
assert(X.getValueType() != VT);
|
|
|
|
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
|
2013-05-25 10:42:55 +08:00
|
|
|
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewX);
|
2012-01-11 16:41:08 +08:00
|
|
|
X = NewX;
|
|
|
|
}
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
|
2012-01-11 16:41:08 +08:00
|
|
|
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
|
2012-01-11 16:41:08 +08:00
|
|
|
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
|
2012-01-12 09:34:44 +08:00
|
|
|
|
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewSRLAmt);
|
|
|
|
insertDAGNode(DAG, N, NewSRL);
|
|
|
|
insertDAGNode(DAG, N, NewSHLAmt);
|
|
|
|
insertDAGNode(DAG, N, NewSHL);
|
2012-01-11 16:41:08 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, NewSHL);
|
|
|
|
|
|
|
|
AM.Scale = 1 << AMShiftAmt;
|
|
|
|
AM.IndexReg = NewSRL;
|
|
|
|
return false;
|
|
|
|
}
|
2017-12-02 06:20:26 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
|
2009-07-23 07:26:55 +08:00
|
|
|
unsigned Depth) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << "MatchAddress: ";
|
|
|
|
AM.dump(CurDAG);
|
|
|
|
});
|
2017-12-02 06:20:26 +08:00
|
|
|
// Limit recursion.
|
|
|
|
if (Depth > 5)
|
2015-10-14 00:23:00 +08:00
|
|
|
return matchAddressBase(N, AM);
|
2009-08-06 07:01:26 +08:00
|
|
|
|
2009-06-27 12:16:01 +08:00
|
|
|
// If this is already a %rip relative address, we can only merge immediates
|
|
|
|
// into it. Instead of handling this in every case, we handle it here.
|
2006-09-08 14:48:29 +08:00
|
|
|
// RIP relative addressing: %rip + 32-bit displacement!
|
2009-06-27 12:16:01 +08:00
|
|
|
if (AM.isRIPRelative()) {
|
|
|
|
// FIXME: JumpTable and ExternalSymbol addresses currently don't like
|
|
|
|
// displacements. It isn't very important, but this should be fixed for
|
|
|
|
// consistency.
|
2015-06-23 01:46:53 +08:00
|
|
|
if (!(AM.ES || AM.MCSym) && AM.JT != -1)
|
|
|
|
return true;
|
2009-08-06 07:01:26 +08:00
|
|
|
|
2011-07-14 04:44:23 +08:00
|
|
|
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
|
2006-09-08 14:48:29 +08:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2005-11-19 10:11:08 +08:00
|
|
|
switch (N.getOpcode()) {
|
|
|
|
default: break;
|
Rename llvm.frameescape and llvm.framerecover to localescape and localrecover
Summary:
Initially, these intrinsics seemed like part of a family of "frame"
related intrinsics, but now I think that's more confusing than helpful.
Initially, the LangRef specified that this would create a new kind of
allocation that would be allocated at a fixed offset from the frame
pointer (EBP/RBP). We ended up dropping that design, and leaving the
stack frame layout alone.
These intrinsics are really about sharing local stack allocations, not
frame pointers. I intend to go further and add an `llvm.localaddress()`
intrinsic that returns whatever register (EBP, ESI, ESP, RBX) is being
used to address locals, which should not be confused with the frame
pointer.
Naming suggestions at this point are welcome, I'm happy to re-run sed.
Reviewers: majnemer, nicholas
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D11011
llvm-svn: 241633
2015-07-08 06:25:32 +08:00
|
|
|
case ISD::LOCAL_RECOVER: {
|
2015-05-05 07:22:36 +08:00
|
|
|
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
|
2015-06-23 01:46:53 +08:00
|
|
|
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
|
|
|
|
// Use the symbol and don't prefix it.
|
|
|
|
AM.MCSym = ESNode->getMCSymbol();
|
|
|
|
return false;
|
|
|
|
}
|
2015-03-06 02:50:12 +08:00
|
|
|
break;
|
|
|
|
}
|
2006-09-08 14:48:29 +08:00
|
|
|
case ISD::Constant: {
|
2008-11-11 23:52:29 +08:00
|
|
|
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Val, AM))
|
2006-09-08 14:48:29 +08:00
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2009-04-13 05:55:03 +08:00
|
|
|
case X86ISD::Wrapper:
|
2009-06-27 12:16:01 +08:00
|
|
|
case X86ISD::WrapperRIP:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!matchWrapper(N, AM))
|
2009-04-13 05:55:03 +08:00
|
|
|
return false;
|
2005-12-08 10:01:35 +08:00
|
|
|
break;
|
|
|
|
|
2009-04-09 05:14:34 +08:00
|
|
|
case ISD::LOAD:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
|
2009-04-09 05:14:34 +08:00
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
|
2006-02-25 18:09:08 +08:00
|
|
|
case ISD::FrameIndex:
|
2011-07-14 05:29:53 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
2011-07-14 05:29:53 +08:00
|
|
|
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
|
2006-02-25 18:09:08 +08:00
|
|
|
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
|
2005-12-17 17:13:43 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
|
|
|
break;
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2005-11-19 10:11:08 +08:00
|
|
|
case ISD::SHL:
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
|
2007-12-08 15:22:58 +08:00
|
|
|
break;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2017-05-12 21:08:45 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
2008-09-13 00:56:44 +08:00
|
|
|
unsigned Val = CN->getZExtValue();
|
2009-07-23 07:26:55 +08:00
|
|
|
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
|
|
|
|
// that the base operand remains free for further matching. If
|
|
|
|
// the base doesn't end up getting used, a post-processing step
|
|
|
|
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
|
2007-12-08 15:22:58 +08:00
|
|
|
if (Val == 1 || Val == 2 || Val == 3) {
|
|
|
|
AM.Scale = 1 << Val;
|
2017-05-12 21:08:45 +08:00
|
|
|
SDValue ShVal = N.getOperand(0);
|
2007-12-08 15:22:58 +08:00
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
|
2011-02-14 06:25:43 +08:00
|
|
|
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
|
2017-05-12 21:08:45 +08:00
|
|
|
AM.IndexReg = ShVal.getOperand(0);
|
|
|
|
ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
|
2012-08-25 07:29:28 +08:00
|
|
|
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Disp, AM))
|
2011-07-14 04:44:23 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
2011-07-14 04:44:23 +08:00
|
|
|
|
|
|
|
AM.IndexReg = ShVal;
|
2007-12-08 15:22:58 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
2007-12-08 15:22:58 +08:00
|
|
|
}
|
2013-01-05 07:01:26 +08:00
|
|
|
break;
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2012-01-11 17:35:02 +08:00
|
|
|
case ISD::SRL: {
|
|
|
|
// Scale must not be used already.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
|
2012-01-11 17:35:02 +08:00
|
|
|
|
|
|
|
SDValue And = N.getOperand(0);
|
|
|
|
if (And.getOpcode() != ISD::AND) break;
|
|
|
|
SDValue X = And.getOperand(0);
|
|
|
|
|
|
|
|
// We only handle up to 64-bit values here as those are what matter for
|
|
|
|
// addressing mode optimizations.
|
2013-08-15 13:57:07 +08:00
|
|
|
if (X.getSimpleValueType().getSizeInBits() > 64) break;
|
2012-01-11 17:35:02 +08:00
|
|
|
|
|
|
|
// The mask used for the transform is expected to be post-shift, but we
|
|
|
|
// found the shift first so just apply the shift to the mask before passing
|
|
|
|
// it down.
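// For example, for (srl (and x, 0x3FC), 2) the mask passed down is
// 0x3FC >> 2 == 0xFF.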
|
|
|
|
if (!isa<ConstantSDNode>(N.getOperand(1)) ||
|
|
|
|
!isa<ConstantSDNode>(And.getOperand(1)))
|
|
|
|
break;
|
|
|
|
uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
|
|
|
|
|
2012-01-11 16:41:08 +08:00
|
|
|
// Try to fold the mask and shift into the scale, and return false if we
|
|
|
|
// succeed.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
|
2012-01-11 16:41:08 +08:00
|
|
|
return false;
|
|
|
|
break;
|
2012-01-11 17:35:02 +08:00
|
|
|
}
|
2012-01-11 16:41:08 +08:00
|
|
|
|
2007-10-23 04:22:24 +08:00
|
|
|
case ISD::SMUL_LOHI:
|
|
|
|
case ISD::UMUL_LOHI:
|
|
|
|
// A mul_lohi where we need the low part can be folded as a plain multiply.
|
2008-08-27 06:36:50 +08:00
|
|
|
if (N.getResNo() != 0) break;
|
2016-08-18 04:30:52 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
2005-11-19 10:11:08 +08:00
|
|
|
case ISD::MUL:
|
2009-03-31 05:36:47 +08:00
|
|
|
case X86ISD::MUL_IMM:
|
2005-11-19 10:11:08 +08:00
|
|
|
// X*[3,5,9] -> X+X*[2,4,8]
|
Eliminate the ISel priority queue, which used the topological order for a
priority function. Instead, just iterate over the AllNodes list, which is
already in topological order. This eliminates a fair amount of bookkeeping,
and speeds up the isel phase by about 15% on many testcases.
The impact on most targets is that AddToISelQueue calls can be simply removed.
In the x86 target, there are two additional notable changes.
The rule-bending AND+SHIFT optimization in MatchAddress that creates new
pre-isel nodes during isel is now a little more verbose, but more robust.
Instead of either creating an invalid DAG or creating an invalid topological
sort, as it has historically done, it can now just insert the new nodes into
the node list at a position where they will be consistent with the topological
ordering.
Also, the address-matching code has logic that checked to see if a node was
"already selected". However, when a node is selected, it has all its uses
taken away via ReplaceAllUsesWith or equivalent, so it won't receive any
further visits from MatchAddress. This code is now removed.
llvm-svn: 58748
2008-11-05 12:14:16 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
|
|
|
AM.IndexReg.getNode() == nullptr) {
|
2017-05-12 21:08:45 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
|
2008-09-13 00:56:44 +08:00
|
|
|
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
|
|
|
|
CN->getZExtValue() == 9) {
|
|
|
|
AM.Scale = unsigned(CN->getZExtValue())-1;
|
2005-11-19 10:11:08 +08:00
|
|
|
|
2017-05-12 21:08:45 +08:00
|
|
|
SDValue MulVal = N.getOperand(0);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Reg;
|
2005-11-19 10:11:08 +08:00
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
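// (Illustrative example:) (mul (add x, 4), 5) may become base = index = x,
// scale = 4, disp = 4 * 5 == 20, i.e. x + 4*x + 20.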
|
2008-08-29 05:40:38 +08:00
|
|
|
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
|
2017-05-12 21:08:45 +08:00
|
|
|
isa<ConstantSDNode>(MulVal.getOperand(1))) {
|
|
|
|
Reg = MulVal.getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
ConstantSDNode *AddVal =
|
2017-05-12 21:08:45 +08:00
|
|
|
cast<ConstantSDNode>(MulVal.getOperand(1));
|
2011-07-14 04:44:23 +08:00
|
|
|
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (foldOffsetIntoAddress(Disp, AM))
|
2017-05-12 21:08:45 +08:00
|
|
|
Reg = N.getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
} else {
|
2017-05-12 21:08:45 +08:00
|
|
|
Reg = N.getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
|
|
|
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.IndexReg = AM.Base_Reg = Reg;
|
2005-11-19 10:11:08 +08:00
|
|
|
return false;
|
|
|
|
}
|
2007-02-05 04:18:17 +08:00
|
|
|
}
|
2005-11-19 10:11:08 +08:00
|
|
|
break;
|
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
case ISD::SUB: {
|
|
|
|
// Given A-B, if A can be completely folded into the address and the
|
|
|
|
// index field is still unused, use -B as the index.
|
|
|
|
// This is a win if A has multiple parts that can be folded into
|
|
|
|
// the address. Also, this saves a mov if the base register has
|
|
|
|
// other uses, since it avoids a two-address sub instruction; however,
|
|
|
|
// it costs an additional mov if the index register has other uses.
|
|
|
|
|
2010-06-18 09:24:29 +08:00
|
|
|
// Add an artificial use to this node so that we can keep track of
|
|
|
|
// it if it gets CSE'd with a different node.
|
|
|
|
HandleSDNode Handle(N);
|
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
// Test if the LHS of the sub can be folded.
|
|
|
|
X86ISelAddressMode Backup = AM;
|
2017-05-12 21:08:45 +08:00
|
|
|
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
|
2009-05-12 02:02:53 +08:00
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// Test if the index field is free for use.
|
2009-06-27 12:16:01 +08:00
|
|
|
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
|
2009-05-12 02:02:53 +08:00
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
2010-03-18 07:58:35 +08:00
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
int Cost = 0;
|
2017-05-12 21:08:45 +08:00
|
|
|
SDValue RHS = Handle.getValue().getOperand(1);
|
2009-05-12 02:02:53 +08:00
|
|
|
// If the RHS involves a register with multiple uses, this
|
|
|
|
// transformation incurs an extra mov, due to the neg instruction
|
|
|
|
// clobbering its operand.
|
|
|
|
if (!RHS.getNode()->hasOneUse() ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
|
|
|
|
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
|
2017-05-12 21:08:45 +08:00
|
|
|
RHS.getOperand(0).getValueType() == MVT::i32))
|
2009-05-12 02:02:53 +08:00
|
|
|
++Cost;
|
|
|
|
// If the base is a register with multiple uses, this
|
|
|
|
// transformation may save a mov.
|
2017-04-21 02:29:14 +08:00
|
|
|
// FIXME: Don't rely on DELETED_NODEs.
|
|
|
|
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
|
|
|
|
AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
|
2010-04-30 07:30:41 +08:00
|
|
|
!AM.Base_Reg.getNode()->hasOneUse()) ||
|
2009-05-12 02:02:53 +08:00
|
|
|
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
|
|
|
|
--Cost;
|
|
|
|
// If the folded LHS was interesting, this transformation saves
|
|
|
|
// address arithmetic.
|
|
|
|
if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
|
|
|
|
((AM.Disp != 0) && (Backup.Disp == 0)) +
|
|
|
|
(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
|
|
|
|
--Cost;
|
|
|
|
// If it doesn't look like it may be an overall win, don't do it.
|
|
|
|
if (Cost >= 0) {
|
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ok, the transformation is legal and appears profitable. Go for it.
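// The address A - B is rewritten as A + (0 - B): B is negated into a fresh
// node that becomes the index register with scale 1.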
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
|
2009-05-12 02:02:53 +08:00
|
|
|
SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
|
|
|
|
AM.IndexReg = Neg;
|
|
|
|
AM.Scale = 1;
|
|
|
|
|
|
|
|
// Insert the new nodes into the topological ordering.
|
2017-03-24 02:25:17 +08:00
|
|
|
insertDAGNode(*CurDAG, Handle.getValue(), Zero);
|
|
|
|
insertDAGNode(*CurDAG, Handle.getValue(), Neg);
|
2009-05-12 02:02:53 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-22 02:56:06 +08:00
|
|
|
case ISD::ADD:
|
|
|
|
if (!matchAdd(N, AM, Depth))
|
2010-06-18 09:24:29 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
break;
|
2006-05-30 14:59:36 +08:00
|
|
|
|
2015-11-10 07:31:38 +08:00
|
|
|
case ISD::OR:
|
[x86] try harder to match bitwise 'or' into an LEA
The motivation for this patch starts with the epic fail example in PR18007:
https://llvm.org/bugs/show_bug.cgi?id=18007
...unfortunately, this patch makes no difference for that case, but it solves some
simpler cases. We'll get there some day. :)
The current 'or' matching code was using computeKnownBits() via
isBaseWithConstantOffset() -> MaskedValueIsZero(), but that's an unnecessarily limited use.
We can do more by copying the logic in ValueTracking's haveNoCommonBitsSet(), so we can
treat the 'or' as if it was an 'add'.
There's a TODO comment here because we should lift the bit-checking logic into a helper
function, so it's not duplicated in DAGCombiner.
An example of the better LEA matching:
leal (%rdi,%rdi), %eax
andl $1, %esi
orl %esi, %eax
Becomes:
andl $1, %esi
leal (%rsi,%rdi,2), %eax
Differential Revision: http://reviews.llvm.org/D13956
llvm-svn: 252515
2015-11-10 05:16:49 +08:00
|
|
|
// We want to look through a transform in InstCombine and DAGCombiner that
|
|
|
|
// turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
|
2015-11-10 07:31:38 +08:00
|
|
|
// Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
|
2015-11-10 05:16:49 +08:00
|
|
|
// An 'lea' can then be used to match the shift (multiply) and add:
|
|
|
|
// and $1, %esi
|
|
|
|
// lea (%rsi, %rdi, 8), %rax
|
2015-11-10 07:31:38 +08:00
|
|
|
if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
|
|
|
|
!matchAdd(N, AM, Depth))
|
|
|
|
return false;
|
2006-05-30 14:59:36 +08:00
|
|
|
break;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2007-12-13 08:43:27 +08:00
|
|
|
case ISD::AND: {
|
Implement x86 h-register extract support.
- Add patterns for h-register extract, which avoids a shift and mask,
and in some cases a temporary register.
- Add address-mode matching for turning (X>>(8-n))&(255<<n), where
n is a valid address-mode scale value, into an h-register extract
and a scaled-offset address.
- Replace X86's MOV32to32_ and related instructions with the new
target-independent COPY_TO_SUBREG instruction.
On x86-64 there are complicated constraints on h registers, and
CodeGen doesn't currently provide a high-level way to express all of them,
so they are handled with a bunch of special code. This code currently only
supports extracts where the result is used by a zero-extend or a store,
though these are fairly common.
These transformations are not always beneficial; since there are only
4 h registers, they sometimes require extra move instructions, and
this sometimes increases register pressure because it can force out
values that would otherwise be in one of those registers. However,
this appears to be relatively uncommon.
llvm-svn: 68962
2009-04-14 00:09:41 +08:00
|
|
|
// Perform some heroic transforms on an and of a constant-count shift
|
|
|
|
// with a constant to enable use of the scaled offset field.
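// For example, (and (srl x, 6), 0x3FC) has the (x >> (8 - n)) & (255 << n)
// shape with n == 2, which the helpers below may turn into an h-register
// extract or a scaled index.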
|
|
|
|
|
2007-12-13 08:43:27 +08:00
|
|
|
// Scale must not be used already.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
|
Fix a x86-64 codegen deficiency. Allow gv + offset when using rip addressing mode.
Before:
_main:
subq $8, %rsp
leaq _X(%rip), %rax
movsd 8(%rax), %xmm1
movss _X(%rip), %xmm0
call _t
xorl %ecx, %ecx
movl %ecx, %eax
addq $8, %rsp
ret
Now:
_main:
subq $8, %rsp
movsd _X+8(%rip), %xmm1
movss _X(%rip), %xmm0
call _t
xorl %ecx, %ecx
movl %ecx, %eax
addq $8, %rsp
ret
Notice there is another idiotic codegen issue that needs to be fixed asap:
xorl %ecx, %ecx
movl %ecx, %eax
llvm-svn: 46850
2008-02-07 16:53:49 +08:00
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
SDValue Shift = N.getOperand(0);
|
|
|
|
if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
|
2009-04-14 00:09:41 +08:00
|
|
|
SDValue X = Shift.getOperand(0);
|
2012-01-11 17:35:00 +08:00
|
|
|
|
|
|
|
// We only handle up to 64-bit values here as those are what matter for
|
|
|
|
// addressing mode optimizations.
|
2013-08-15 13:57:07 +08:00
|
|
|
if (X.getSimpleValueType().getSizeInBits() > 64) break;
|
2012-01-11 17:35:00 +08:00
|
|
|
|
2012-01-11 17:35:04 +08:00
|
|
|
if (!isa<ConstantSDNode>(N.getOperand(1)))
|
|
|
|
break;
|
|
|
|
uint64_t Mask = N.getConstantOperandVal(1);
|
2007-12-13 08:43:27 +08:00
|
|
|
|
2012-01-11 16:48:20 +08:00
|
|
|
// Try to fold the mask and shift into an extract and scale.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 16:48:20 +08:00
|
|
|
return false;
|
2009-04-14 00:09:41 +08:00
|
|
|
|
2012-01-11 16:48:20 +08:00
|
|
|
// Try to fold the mask and shift directly into the scale.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 16:41:08 +08:00
|
|
|
return false;
|
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
// Try to swap the mask and shift to place shifts which can be done as
|
|
|
|
// a scale on the outside of the mask.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 17:35:00 +08:00
|
|
|
return false;
|
|
|
|
break;
|
2007-12-13 08:43:27 +08:00
|
|
|
}
|
2006-05-30 14:59:36 +08:00
|
|
|
}
|
2005-11-19 10:11:08 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
return matchAddressBase(N, AM);
|
2007-08-14 04:03:06 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Helper for MatchAddress. Add the specified node to the
|
2007-08-14 04:03:06 +08:00
|
|
|
/// specified addressing mode without any further recursion.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
|
2005-11-19 10:11:08 +08:00
|
|
|
// Is the base register already occupied?
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
|
2005-11-19 10:11:08 +08:00
|
|
|
// If so, check to see if the scale index register is set.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!AM.IndexReg.getNode()) {
|
2005-11-19 10:11:08 +08:00
|
|
|
AM.IndexReg = N;
|
|
|
|
AM.Scale = 1;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, we cannot select it.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default, generate it as a register.
|
|
|
|
AM.BaseType = X86ISelAddressMode::RegBase;
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = N;
|
2005-11-19 10:11:08 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-11-14 01:53:59 +08:00
|
|
|
/// Helper for selectVectorAddr. Handles things that can be folded into a
|
|
|
|
/// gather scatter address. The index register and scale should have already
|
|
|
|
/// been handled.
|
|
|
|
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
|
|
|
|
// TODO: Support other operations.
|
|
|
|
switch (N.getOpcode()) {
|
2018-01-11 03:16:05 +08:00
|
|
|
case ISD::Constant: {
|
|
|
|
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
|
|
|
|
if (!foldOffsetIntoAddress(Val, AM))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
2017-11-14 01:53:59 +08:00
|
|
|
case X86ISD::Wrapper:
|
|
|
|
if (!matchWrapper(N, AM))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return matchAddressBase(N, AM);
|
|
|
|
}
|
|
|
|
|
2017-11-11 03:26:04 +08:00
|
|
|
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
|
|
|
|
SDValue &Scale, SDValue &Index,
|
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2017-11-14 01:53:59 +08:00
|
|
|
X86ISelAddressMode AM;
|
2017-11-22 16:10:54 +08:00
|
|
|
auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
|
|
|
|
AM.IndexReg = Mgs->getIndex();
|
2018-01-11 03:16:05 +08:00
|
|
|
AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
|
2017-11-11 03:26:04 +08:00
|
|
|
|
|
|
|
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
|
2016-05-04 04:16:08 +08:00
|
|
|
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
|
2015-04-30 16:38:48 +08:00
|
|
|
if (AddrSpace == 256)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
|
|
|
|
if (AddrSpace == 257)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
|
2016-05-04 04:16:08 +08:00
|
|
|
if (AddrSpace == 258)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
|
2015-04-30 16:38:48 +08:00
|
|
|
|
2018-01-11 03:16:05 +08:00
|
|
|
// Try to match into the base and displacement fields.
|
|
|
|
if (matchVectorAddress(N, AM))
|
2017-11-14 01:53:59 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
MVT VT = N.getSimpleValueType();
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase) {
|
|
|
|
if (!AM.Base_Reg.getNode())
|
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, VT);
|
|
|
|
}
|
|
|
|
|
|
|
|
getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
|
2015-04-30 16:38:48 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Returns true if it is able to pattern match an addressing mode.
|
2005-12-08 10:01:35 +08:00
|
|
|
/// It returns the operands which make up the maximal addressing mode it can
|
|
|
|
/// match by reference.
|
2010-09-22 06:07:31 +08:00
|
|
|
///
|
|
|
|
/// Parent is the parent node of the addr operand that is being matched. It
|
|
|
|
/// is always a load, store, atomic node, or null. It is only null when
|
|
|
|
/// checking memory operands for inline asm nodes.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2005-12-08 10:01:35 +08:00
|
|
|
X86ISelAddressMode AM;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-09-22 06:07:31 +08:00
|
|
|
if (Parent &&
|
|
|
|
// These opcodes are all the nodes that have an "addr:$ptr" operand
|
|
|
|
// that are not a MemSDNode, and thus don't have proper addrspace info.
|
|
|
|
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
|
2010-09-23 04:42:08 +08:00
|
|
|
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
|
2012-10-16 06:39:43 +08:00
|
|
|
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
|
|
|
|
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
|
|
|
|
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
|
2010-09-22 06:07:31 +08:00
|
|
|
unsigned AddrSpace =
|
|
|
|
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
|
2016-05-04 04:16:08 +08:00
|
|
|
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
|
2010-09-22 06:07:31 +08:00
|
|
|
if (AddrSpace == 256)
|
2010-09-22 12:39:11 +08:00
|
|
|
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
|
2010-09-22 06:07:31 +08:00
|
|
|
if (AddrSpace == 257)
|
2010-09-22 12:39:11 +08:00
|
|
|
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
|
2016-05-04 04:16:08 +08:00
|
|
|
if (AddrSpace == 258)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
|
2010-09-22 06:07:31 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
if (matchAddress(N, AM))
|
2010-09-22 12:39:11 +08:00
|
|
|
return false;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2010-09-22 12:39:11 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase) {
|
|
|
|
if (!AM.Base_Reg.getNode())
|
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, VT);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!AM.IndexReg.getNode())
|
|
|
|
AM.IndexReg = CurDAG->getRegister(0, VT);
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
|
2006-01-11 14:09:51 +08:00
|
|
|
return true;
|
2005-12-08 10:01:35 +08:00
|
|
|
}
|
|
|
|
|
2017-08-22 00:04:04 +08:00
|
|
|
// We can only fold a load if all nodes between it and the root node have a
|
|
|
|
// single use. If there are additional uses, we could end up duplicating the
|
|
|
|
// load.
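// Note that this walks only the first-use chain from N up toward Root, so it
// assumes the intermediate nodes form a simple single-use chain.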
|
|
|
|
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
|
|
|
|
SDNode *User = *N->use_begin();
|
|
|
|
while (User != Root) {
|
|
|
|
if (!User->hasOneUse())
|
|
|
|
return false;
|
|
|
|
User = *User->use_begin();
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Match a scalar SSE load. In particular, we want to match a load whose top
|
|
|
|
/// elements are either undef or zeros. The load flavor is derived from the
|
|
|
|
/// type of N, which is either v4f32 or v2f64.
|
2010-02-17 14:07:47 +08:00
|
|
|
///
|
|
|
|
/// We also return:
|
2010-02-21 11:17:59 +08:00
|
|
|
/// PatternChainNode: this is the matched node that has a chain input and
|
|
|
|
/// output.
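/// For example, a (v4f32 (scalar_to_vector (load f32))) node is a candidate
/// whose load may be folded into the SSE instruction that uses it.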
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue N, SDValue &Base,
|
|
|
|
SDValue &Scale, SDValue &Index,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Disp, SDValue &Segment,
|
2010-02-21 11:17:59 +08:00
|
|
|
SDValue &PatternNodeWithChain) {
|
2016-12-12 15:57:24 +08:00
|
|
|
// We can allow a full vector load here since narrowing a load is ok.
|
|
|
|
if (ISD::isNON_EXTLoad(N.getNode())) {
|
|
|
|
PatternNodeWithChain = N;
|
|
|
|
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2017-08-22 00:04:04 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
|
|
|
|
hasSingleUsesFromRoot(Root, N.getNode())) {
|
2016-12-12 15:57:24 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We can also match the special zero extended load opcode.
|
|
|
|
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
|
|
|
|
PatternNodeWithChain = N;
|
|
|
|
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2017-08-22 00:04:04 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
|
|
|
|
hasSingleUsesFromRoot(Root, N.getNode())) {
|
2016-12-12 15:57:24 +08:00
|
|
|
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-11-27 01:29:25 +08:00
|
|
|
// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
|
|
|
|
// once. Otherwise the load might get duplicated and the chain output of the
|
|
|
|
// duplicate load will not be observed by all dependencies.
|
|
|
|
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
|
2010-02-21 11:17:59 +08:00
|
|
|
PatternNodeWithChain = N.getOperand(0);
|
|
|
|
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
|
2016-11-27 01:29:25 +08:00
|
|
|
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2017-08-22 00:04:04 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
|
|
|
|
hasSingleUsesFromRoot(Root, N.getNode())) {
|
2010-02-21 11:17:59 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
2016-11-27 02:43:21 +08:00
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
2006-10-08 05:55:32 +08:00
|
|
|
}
|
|
|
|
}
|
Fold "zero extending vector loads" now that evan added the chain manip stuff.
This compiles both tests in X86/vec_ss_load_fold.ll into:
_test1:
movss 4(%esp), %xmm0
subss LCPI1_0, %xmm0
mulss LCPI1_1, %xmm0
minss LCPI1_2, %xmm0
xorps %xmm1, %xmm1
maxss %xmm1, %xmm0
cvttss2si %xmm0, %eax
andl $65535, %eax
ret
instead of:
_test1:
movss LCPI1_0, %xmm0
movss 4(%esp), %xmm1
subss %xmm0, %xmm1
movss LCPI1_1, %xmm0
mulss %xmm0, %xmm1
movss LCPI1_2, %xmm0
minss %xmm0, %xmm1
xorps %xmm0, %xmm0
maxss %xmm0, %xmm1
cvttss2si %xmm1, %eax
andl $65535, %eax
ret
llvm-svn: 30894
2006-10-12 06:09:58 +08:00
|
|
|
|
|
|
|
// Also handle the case where we explicitly require zeros in the top
|
2006-10-08 05:55:32 +08:00
|
|
|
// elements. This is a vector shuffle from the zero vector.
|
2008-08-29 05:40:38 +08:00
|
|
|
if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
|
Fix a long standing deficiency in the X86 backend: we would
sometimes emit "zero" and "all one" vectors multiple times,
for example:
_test2:
pcmpeqd %mm0, %mm0
movq %mm0, _M1
pcmpeqd %mm0, %mm0
movq %mm0, _M2
ret
instead of:
_test2:
pcmpeqd %mm0, %mm0
movq %mm0, _M1
movq %mm0, _M2
ret
This patch fixes this by always arranging for zero/one vectors
to be defined as v4i32 or v2i32 (SSE/MMX) instead of letting them be
any random type. This ensures they get trivially CSE'd on the dag.
This fix is also important for LegalizeDAGTypes, as it gets unhappy
when the x86 backend wants BUILD_VECTOR(i64 0) to be legal even when
'i64' isn't legal.
This patch makes the following changes:
1) X86TargetLowering::LowerBUILD_VECTOR now lowers 0/1 vectors into
their canonical types.
2) The now-dead patterns are removed from the SSE/MMX .td files.
3) All the patterns in the .td file that referred to immAllOnesV or
immAllZerosV in the wrong form now use *_bc to match them with a
bitcast wrapped around them.
4) X86DAGToDAGISel::SelectScalarSSELoad is generalized to handle
bitcast'd zero vectors, which simplifies the code actually.
5) getShuffleVectorZeroOrUndef is updated to generate a shuffle that
is legal, instead of generating one that is illegal and expecting
a later legalize pass to clean it up.
6) isZeroShuffle is generalized to handle bitcast of zeros.
7) several other minor tweaks.
This patch is definite goodness, but has the potential to cause random
code quality regressions. Please be on the lookout for these and let
me know if they happen.
llvm-svn: 44310
2007-11-25 08:24:49 +08:00
|
|
|
// Check to see if the top elements are all zeros (or bitcast of zeros).
|
2012-08-02 02:39:17 +08:00
|
|
|
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
|
2016-11-27 02:43:24 +08:00
|
|
|
N.getOperand(0).getNode()->hasOneUse()) {
|
|
|
|
PatternNodeWithChain = N.getOperand(0).getOperand(0);
|
|
|
|
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
|
|
|
|
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2017-08-22 00:04:04 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
|
|
|
|
hasSingleUsesFromRoot(Root, N.getNode())) {
|
2016-11-27 02:43:24 +08:00
|
|
|
// Okay, this is a zero extending load. Fold it.
|
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
Fold "zero extending vector loads" now that evan added the chain manip stuff.
This compiles both tests in X86/vec_ss_load_fold.ll into:
_test1:
movss 4(%esp), %xmm0
subss LCPI1_0, %xmm0
mulss LCPI1_1, %xmm0
minss LCPI1_2, %xmm0
xorps %xmm1, %xmm1
maxss %xmm1, %xmm0
cvttss2si %xmm0, %eax
andl $65535, %eax
ret
instead of:
_test1:
movss LCPI1_0, %xmm0
movss 4(%esp), %xmm1
subss %xmm0, %xmm1
movss LCPI1_1, %xmm0
mulss %xmm0, %xmm1
movss LCPI1_2, %xmm0
minss %xmm0, %xmm1
xorps %xmm0, %xmm0
maxss %xmm0, %xmm1
cvttss2si %xmm1, %eax
andl $65535, %eax
ret
llvm-svn: 30894
2006-10-12 06:09:58 +08:00
|
|
|
}
|
2016-11-27 02:43:24 +08:00
|
|
|
|
2006-10-08 05:55:32 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
|
2013-06-01 17:55:14 +08:00
|
|
|
if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
|
|
|
|
uint64_t ImmVal = CN->getZExtValue();
|
2017-09-13 10:29:59 +08:00
|
|
|
if (!isUInt<32>(ImmVal))
|
2013-06-01 17:55:14 +08:00
|
|
|
return false;
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
|
2013-06-01 17:55:14 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// In static codegen with small code model, we can get the address of a label
|
|
|
|
// into a register with 'movl'. TableGen has already made sure we're looking
|
|
|
|
// at a label of some kind.
|
2013-06-11 04:43:49 +08:00
|
|
|
assert(N->getOpcode() == X86ISD::Wrapper &&
|
|
|
|
"Unexpected node type for MOV32ri64");
|
2013-06-01 17:55:14 +08:00
|
|
|
N = N.getOperand(0);
|
|
|
|
|
2016-11-17 05:48:59 +08:00
|
|
|
// At least GNU as does not accept 'movl' for TPOFF relocations.
|
|
|
|
// FIXME: We could use 'movl' when we know we are targeting MC.
|
|
|
|
if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
|
2013-06-01 17:55:14 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
Imm = N;
|
2016-12-09 03:01:00 +08:00
|
|
|
if (N->getOpcode() != ISD::TargetGlobalAddress)
|
|
|
|
return TM.getCodeModel() == CodeModel::Small;
|
|
|
|
|
|
|
|
Optional<ConstantRange> CR =
|
|
|
|
cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
|
|
|
|
if (!CR)
|
|
|
|
return TM.getCodeModel() == CodeModel::Small;
|
|
|
|
|
|
|
|
return CR->getUnsignedMax().ult(1ull << 32);
|
2013-06-01 17:55:14 +08:00
|
|
|
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
|
2013-06-11 04:43:49 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2016-04-13 05:34:24 +08:00
|
|
|
// Save the debug loc before calling selectLEAAddr, in case it invalidates N.
|
|
|
|
SDLoc DL(N);
|
2017-12-02 06:20:26 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
|
2013-06-11 04:43:49 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
|
|
|
|
if (RN && RN->getReg() == 0)
|
|
|
|
Base = CurDAG->getRegister(0, MVT::i64);
|
2014-08-20 19:59:22 +08:00
|
|
|
else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
|
2013-06-11 04:43:49 +08:00
|
|
|
// Base could already be %rip, particularly in the x32 ABI.
|
|
|
|
Base = SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, DL, MVT::i64),
|
2013-06-11 04:43:49 +08:00
|
|
|
Base,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
|
2013-06-11 04:43:49 +08:00
|
|
|
0);
|
|
|
|
}
|
|
|
|
|
|
|
|
RN = dyn_cast<RegisterSDNode>(Index);
|
|
|
|
if (RN && RN->getReg() == 0)
|
|
|
|
Index = CurDAG->getRegister(0, MVT::i64);
|
|
|
|
else {
|
|
|
|
assert(Index.getValueType() == MVT::i32 &&
|
|
|
|
"Expect to be extending 32-bit registers for use in LEA");
|
|
|
|
Index = SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, DL, MVT::i64),
|
2013-06-11 04:43:49 +08:00
|
|
|
Index,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, DL,
|
|
|
|
MVT::i32)),
|
2013-06-11 04:43:49 +08:00
|
|
|
0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Calls SelectAddr and determines if the maximal addressing
|
2006-02-25 18:09:08 +08:00
|
|
|
/// mode it matches can be cost effectively emitted as an LEA instruction.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Base, SDValue &Scale,
|
2010-07-09 07:46:44 +08:00
|
|
|
SDValue &Index, SDValue &Disp,
|
|
|
|
SDValue &Segment) {
|
2006-02-25 18:09:08 +08:00
|
|
|
X86ISelAddressMode AM;
|
|
|
|
|
2016-04-13 05:34:24 +08:00
|
|
|
// Save the DL and VT before calling matchAddress, it can invalidate N.
|
|
|
|
SDLoc DL(N);
|
|
|
|
MVT VT = N.getSimpleValueType();
|
|
|
|
|
2009-04-10 18:09:34 +08:00
|
|
|
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
|
|
|
|
// segments.
|
|
|
|
SDValue Copy = AM.Segment;
|
2009-08-12 04:47:22 +08:00
|
|
|
SDValue T = CurDAG->getRegister(0, MVT::i32);
|
2009-04-10 18:09:34 +08:00
|
|
|
AM.Segment = T;
|
2017-12-02 06:20:26 +08:00
|
|
|
if (matchAddress(N, AM))
|
2009-04-09 05:14:34 +08:00
|
|
|
return false;
|
2009-04-10 18:09:34 +08:00
|
|
|
assert(T == AM.Segment);
|
|
|
|
AM.Segment = Copy;
|
2009-04-09 05:14:34 +08:00
|
|
|
|
2006-02-25 18:09:08 +08:00
|
|
|
unsigned Complexity = 0;
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase)
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.Base_Reg.getNode())
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity = 1;
|
|
|
|
else
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, VT);
|
2006-02-25 18:09:08 +08:00
|
|
|
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
|
|
|
|
Complexity = 4;
|
|
|
|
|
2008-08-29 05:40:38 +08:00
|
|
|
if (AM.IndexReg.getNode())
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity++;
|
|
|
|
else
|
2006-09-08 14:48:29 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(0, VT);
|
2006-02-25 18:09:08 +08:00
|
|
|
|
Two changes:
1) codegen a shift of a register as a shift, not an LEA.
2) teach the RA to convert a shift to an LEA instruction if it wants something
in three-address form.
This gives us asm diffs like:
- leal (,%eax,4), %eax
+ shll $2, %eax
which is faster on some processors and smaller on all of them.
and, more interestingly:
- movl 24(%esi), %eax
- leal (,%eax,4), %edi
+ movl 24(%esi), %edi
+ shll $2, %edi
Without #2, #1 was a significant pessimization in some cases.
This implements CodeGen/X86/shift-codegen.ll
llvm-svn: 35204
2007-03-20 14:08:29 +08:00
|
|
|
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
|
|
|
|
// a simple shift.
|
|
|
|
if (AM.Scale > 1)
|
2006-03-01 05:13:57 +08:00
|
|
|
Complexity++;
|
2006-02-25 18:09:08 +08:00
|
|
|
|
|
|
|
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
|
2015-10-13 00:09:59 +08:00
|
|
|
// to a LEA. This is determined with some experimentation but is by no means
|
2006-02-25 18:09:08 +08:00
|
|
|
// optimal (especially for code size consideration). LEA is nice because of
|
|
|
|
// its three-address nature. Tweak the cost function again when we can run
|
|
|
|
// convertToThreeAddress() at register allocation time.
|
2009-02-07 08:43:41 +08:00
|
|
|
if (AM.hasSymbolicDisplacement()) {
|
2015-10-13 00:09:59 +08:00
|
|
|
// For X86-64, always use LEA to materialize RIP-relative addresses.
|
2006-12-06 06:03:40 +08:00
|
|
|
if (Subtarget->is64Bit())
|
2006-09-08 14:48:29 +08:00
|
|
|
Complexity = 4;
|
|
|
|
else
|
|
|
|
Complexity += 2;
|
|
|
|
}
|
2006-02-25 18:09:08 +08:00
|
|
|
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity++;
|
|
|
|
|
2009-07-12 06:50:33 +08:00
|
|
|
// If it isn't worth using an LEA, reject it.
|
2009-07-12 07:07:30 +08:00
|
|
|
if (Complexity <= 2)
|
2009-07-12 06:50:33 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2016-04-13 05:34:24 +08:00
|
|
|
getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
|
2009-07-12 06:50:33 +08:00
|
|
|
return true;
|
2006-02-25 18:09:08 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// This is only run on TargetGlobalTLSAddress nodes.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
|
2009-06-21 04:38:48 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
2010-07-09 07:46:44 +08:00
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2009-06-21 04:38:48 +08:00
|
|
|
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
|
|
|
|
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2009-06-21 04:38:48 +08:00
|
|
|
X86ISelAddressMode AM;
|
|
|
|
AM.GV = GA->getGlobal();
|
|
|
|
AM.Disp += GA->getOffset();
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
|
2009-06-27 05:18:37 +08:00
|
|
|
AM.SymbolFlags = GA->getTargetFlags();
|
|
|
|
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N.getValueType() == MVT::i32) {
|
2009-06-21 04:38:48 +08:00
|
|
|
AM.Scale = 1;
|
2009-08-12 04:47:22 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
|
2009-06-21 04:38:48 +08:00
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
|
2009-06-21 04:38:48 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
|
2009-06-21 04:38:48 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-11-10 07:53:43 +08:00
|
|
|
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
|
|
|
|
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
|
|
|
|
Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
|
|
|
|
N.getValueType());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-12-09 03:01:00 +08:00
|
|
|
// Keep track of the original value type and whether this value was
|
|
|
|
// truncated. If we see a truncation from pointer type to VT that truncates
|
|
|
|
// bits that are known to be zero, we can use a narrow reference.
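// For example, (i32 (truncate (i64 X86ISD::Wrapper(TargetGlobalAddress)))) may
// be selected as a 32-bit immediate when the symbol's absolute range is known
// to fit in 32 bits.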
|
|
|
|
EVT VT = N.getValueType();
|
|
|
|
bool WasTruncated = false;
|
|
|
|
if (N.getOpcode() == ISD::TRUNCATE) {
|
|
|
|
WasTruncated = true;
|
|
|
|
N = N.getOperand(0);
|
|
|
|
}
|
|
|
|
|
2016-11-10 07:53:43 +08:00
|
|
|
if (N.getOpcode() != X86ISD::Wrapper)
|
|
|
|
return false;
|
|
|
|
|
2016-12-09 03:01:00 +08:00
|
|
|
// We can only use non-GlobalValues as immediates if they were not truncated,
|
|
|
|
// as we do not have any range information. If we have a GlobalValue and the
|
|
|
|
// address was not truncated, we can select it as an operand directly.
|
|
|
|
unsigned Opc = N.getOperand(0)->getOpcode();
|
|
|
|
if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
|
|
|
|
Op = N.getOperand(0);
|
|
|
|
// We can only select the operand directly if we didn't have to look past a
|
|
|
|
// truncate.
|
|
|
|
return !WasTruncated;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check that the global's range fits into VT.
|
|
|
|
auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
|
|
|
|
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
|
|
|
|
if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Okay, we can use a narrow reference.
|
|
|
|
Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
|
|
|
|
GA->getOffset(), GA->getTargetFlags());
|
2016-11-17 05:48:59 +08:00
|
|
|
return true;
|
2016-11-10 07:53:43 +08:00
|
|
|
}
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2017-11-09 04:17:33 +08:00
|
|
|
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Base, SDValue &Scale,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Index, SDValue &Disp,
|
|
|
|
SDValue &Segment) {
|
2010-03-03 06:20:06 +08:00
|
|
|
if (!ISD::isNON_EXTLoad(N.getNode()) ||
|
2017-11-09 04:17:33 +08:00
|
|
|
!IsProfitableToFold(N, P, Root) ||
|
|
|
|
!IsLegalToFold(N, P, Root, OptLevel))
|
2010-03-03 06:20:06 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
return selectAddr(N.getNode(),
|
2010-09-22 06:07:31 +08:00
|
|
|
N.getOperand(1), Base, Scale, Index, Disp, Segment);
|
2006-01-07 04:36:21 +08:00
|
|
|
}
|
|
|
|
|
2018-04-28 06:15:33 +08:00
|
|
|
bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
|
|
|
|
SDValue &Base, SDValue &Scale,
|
|
|
|
SDValue &Index, SDValue &Disp,
|
|
|
|
SDValue &Segment) {
|
|
|
|
if (!ISD::isNON_EXTLoad(N.getNode()) ||
|
|
|
|
useNonTemporalLoad(cast<LoadSDNode>(N)) ||
|
|
|
|
!IsProfitableToFold(N, P, Root) ||
|
|
|
|
!IsLegalToFold(N, P, Root, OptLevel))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return selectAddr(N.getNode(),
|
|
|
|
N.getOperand(1), Base, Scale, Index, Disp, Segment);
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Return an SDNode that returns the value of the global base register.
|
|
|
|
/// Output instructions required to initialize the global base register,
|
|
|
|
/// if necessary.
|
2006-08-26 13:34:46 +08:00
|
|
|
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
|
2009-06-04 04:20:00 +08:00
|
|
|
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
|
2015-07-09 10:09:04 +08:00
|
|
|
auto &DL = MF->getDataLayout();
|
|
|
|
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
|
2006-02-18 08:15:05 +08:00
|
|
|
}
|
|
|
|
|
2017-02-10 06:02:28 +08:00
|
|
|
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
|
|
|
|
if (N->getOpcode() == ISD::TRUNCATE)
|
|
|
|
N = N->getOperand(0).getNode();
|
|
|
|
if (N->getOpcode() != X86ISD::Wrapper)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
|
|
|
|
if (!GA)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
|
|
|
|
return CR && CR->getSignedMin().sge(-1ull << Width) &&
|
|
|
|
CR->getSignedMax().slt(1ull << Width);
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Return true if the given X86ISD::CMP node has no uses which require the SF
|
|
|
|
/// or OF bits to be accurate.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool hasNoSignedComparisonUses(SDNode *N) {
|
2009-10-10 04:35:19 +08:00
|
|
|
// Examine each user of the node.
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(),
|
|
|
|
UE = N->use_end(); UI != UE; ++UI) {
|
|
|
|
// Only examine CopyToReg uses.
|
|
|
|
if (UI->getOpcode() != ISD::CopyToReg)
|
|
|
|
return false;
|
|
|
|
// Only examine CopyToReg uses that copy to EFLAGS.
|
|
|
|
if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
|
|
|
|
X86::EFLAGS)
|
|
|
|
return false;
|
|
|
|
// Examine each user of the CopyToReg use.
|
|
|
|
for (SDNode::use_iterator FlagUI = UI->use_begin(),
|
|
|
|
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
|
|
|
|
// Only examine the Flag result.
|
|
|
|
if (FlagUI.getUse().getResNo() != 1) continue;
|
|
|
|
// Anything unusual: assume conservatively.
|
|
|
|
if (!FlagUI->isMachineOpcode()) return false;
|
|
|
|
// Examine the opcode of the user.
|
|
|
|
switch (FlagUI->getMachineOpcode()) {
|
|
|
|
// These comparisons don't treat the most significant bit specially.
|
|
|
|
case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
|
|
|
|
case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
|
|
|
|
case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
|
|
|
|
case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
|
2015-01-06 12:23:53 +08:00
|
|
|
case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
|
|
|
|
case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
|
2009-10-10 04:35:19 +08:00
|
|
|
case X86::CMOVA16rr: case X86::CMOVA16rm:
|
|
|
|
case X86::CMOVA32rr: case X86::CMOVA32rm:
|
|
|
|
case X86::CMOVA64rr: case X86::CMOVA64rm:
|
|
|
|
case X86::CMOVAE16rr: case X86::CMOVAE16rm:
|
|
|
|
case X86::CMOVAE32rr: case X86::CMOVAE32rm:
|
|
|
|
case X86::CMOVAE64rr: case X86::CMOVAE64rm:
|
|
|
|
case X86::CMOVB16rr: case X86::CMOVB16rm:
|
|
|
|
case X86::CMOVB32rr: case X86::CMOVB32rm:
|
|
|
|
case X86::CMOVB64rr: case X86::CMOVB64rm:
|
2010-10-06 07:00:14 +08:00
|
|
|
case X86::CMOVBE16rr: case X86::CMOVBE16rm:
|
|
|
|
case X86::CMOVBE32rr: case X86::CMOVBE32rm:
|
|
|
|
case X86::CMOVBE64rr: case X86::CMOVBE64rm:
|
2009-10-10 04:35:19 +08:00
|
|
|
case X86::CMOVE16rr: case X86::CMOVE16rm:
|
|
|
|
case X86::CMOVE32rr: case X86::CMOVE32rm:
|
|
|
|
case X86::CMOVE64rr: case X86::CMOVE64rm:
|
|
|
|
case X86::CMOVNE16rr: case X86::CMOVNE16rm:
|
|
|
|
case X86::CMOVNE32rr: case X86::CMOVNE32rm:
|
|
|
|
case X86::CMOVNE64rr: case X86::CMOVNE64rm:
|
|
|
|
case X86::CMOVNP16rr: case X86::CMOVNP16rm:
|
|
|
|
case X86::CMOVNP32rr: case X86::CMOVNP32rm:
|
|
|
|
case X86::CMOVNP64rr: case X86::CMOVNP64rm:
|
|
|
|
case X86::CMOVP16rr: case X86::CMOVP16rm:
|
|
|
|
case X86::CMOVP32rr: case X86::CMOVP32rm:
|
|
|
|
case X86::CMOVP64rr: case X86::CMOVP64rm:
|
|
|
|
continue;
|
|
|
|
// Anything else: assume conservatively.
|
|
|
|
default: return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-09-08 07:54:24 +08:00
|
|
|
/// Return true if the given flag-setting node has no uses which require the
|
|
|
|
/// CF flag to be accurate.
|
|
|
|
static bool hasNoCarryFlagUses(SDNode *N) {
|
|
|
|
// Examine each user of the node.
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
|
|
|
|
++UI) {
|
|
|
|
// Only check things that use the flags.
|
|
|
|
if (UI.getUse().getResNo() != 1)
|
|
|
|
continue;
|
|
|
|
// Only examine CopyToReg uses.
|
|
|
|
if (UI->getOpcode() != ISD::CopyToReg)
|
|
|
|
return false;
|
|
|
|
// Only examine CopyToReg uses that copy to EFLAGS.
|
|
|
|
if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
|
|
|
|
return false;
|
|
|
|
// Examine each user of the CopyToReg use.
|
|
|
|
for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
|
|
|
|
FlagUI != FlagUE; ++FlagUI) {
|
|
|
|
// Only examine the Flag result.
|
|
|
|
if (FlagUI.getUse().getResNo() != 1)
|
|
|
|
continue;
|
|
|
|
// Anything unusual: assume conservatively.
|
|
|
|
if (!FlagUI->isMachineOpcode())
|
|
|
|
return false;
|
|
|
|
// Examine the opcode of the user.
|
|
|
|
switch (FlagUI->getMachineOpcode()) {
|
|
|
|
// Comparisons which don't examine the CF flag.
|
|
|
|
case X86::SETOr: case X86::SETNOr: case X86::SETEr: case X86::SETNEr:
|
|
|
|
case X86::SETSr: case X86::SETNSr: case X86::SETPr: case X86::SETNPr:
|
|
|
|
case X86::SETLr: case X86::SETGEr: case X86::SETLEr: case X86::SETGr:
|
|
|
|
case X86::JO_1: case X86::JNO_1: case X86::JE_1: case X86::JNE_1:
|
|
|
|
case X86::JS_1: case X86::JNS_1: case X86::JP_1: case X86::JNP_1:
|
|
|
|
case X86::JL_1: case X86::JGE_1: case X86::JLE_1: case X86::JG_1:
|
|
|
|
case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
|
|
|
|
case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm:
|
|
|
|
case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
|
|
|
|
case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
|
|
|
|
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
|
|
|
|
case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm:
|
|
|
|
case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
|
|
|
|
case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
|
|
|
|
case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
|
|
|
|
case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm:
|
|
|
|
case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
|
|
|
|
case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
|
|
|
|
case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
|
|
|
|
case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm:
|
|
|
|
case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
|
|
|
|
case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
|
|
|
|
case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
|
|
|
|
case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm:
|
|
|
|
case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
|
|
|
|
case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
|
|
|
|
case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
|
|
|
|
case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
|
|
|
|
case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
|
|
|
|
case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm:
|
|
|
|
continue;
|
|
|
|
// Anything else: assume conservatively.
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-08-25 10:06:36 +08:00
|
|
|
/// Check whether or not the chain ending in StoreNode is suitable for doing
|
|
|
|
/// the {load; op; store} to modify transformation.
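/// For example, (store (add (load p), 1), p) is a candidate, provided the load
/// has no other users and the chains and addresses line up as checked below.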
|
|
|
|
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
|
|
|
|
SDValue StoredVal, SelectionDAG *CurDAG,
|
|
|
|
LoadSDNode *&LoadNode,
|
|
|
|
SDValue &InputChain) {
|
2012-03-29 13:45:48 +08:00
|
|
|
// is the stored value result 0 of the load?
|
|
|
|
if (StoredVal.getResNo() != 0) return false;
|
|
|
|
|
|
|
|
// are there other uses of the loaded value than the inc or dec?
|
|
|
|
if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// is the store non-extending and non-indexed?
|
|
|
|
if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
SDValue Load = StoredVal->getOperand(0);
|
|
|
|
// Is the stored value a non-extending and non-indexed load?
|
|
|
|
if (!ISD::isNormalLoad(Load.getNode())) return false;
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Return LoadNode by reference.
|
|
|
|
LoadNode = cast<LoadSDNode>(Load);
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Is store the only read of the loaded value?
|
|
|
|
if (!Load.hasOneUse())
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Is the address of the store the same as the load?
|
|
|
|
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
|
|
|
|
LoadNode->getOffset() != StoreNode->getOffset())
|
|
|
|
return false;
|
|
|
|
|
2018-03-20 04:19:46 +08:00
|
|
|
bool FoundLoad = false;
|
|
|
|
SmallVector<SDValue, 4> ChainOps;
|
|
|
|
SmallVector<const SDNode *, 4> LoopWorklist;
|
|
|
|
SmallPtrSet<const SDNode *, 16> Visited;
|
|
|
|
const unsigned int Max = 1024;
|
|
|
|
|
|
|
|
// Visualization of Load-Op-Store fusion:
|
|
|
|
// -------------------------
|
|
|
|
// Legend:
|
|
|
|
// *-lines = Chain operand dependencies.
|
|
|
|
// |-lines = Normal operand dependencies.
|
|
|
|
// Dependencies flow down and right. n-suffix references multiple nodes.
|
|
|
|
//
|
|
|
|
// C Xn C
|
|
|
|
// * * *
|
|
|
|
// * * *
|
|
|
|
// Xn A-LD Yn TF Yn
|
|
|
|
// * * \ | * |
|
|
|
|
// * * \ | * |
|
|
|
|
// * * \ | => A--LD_OP_ST
|
|
|
|
// * * \| \
|
|
|
|
// TF OP \
|
|
|
|
// * | \ Zn
|
|
|
|
// * | \
|
|
|
|
// A-ST Zn
|
|
|
|
//
|
|
|
|
|
|
|
|
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
|
|
|
|
// #2: Yn -> LD
|
|
|
|
// #3: ST -> Zn
|
|
|
|
|
|
|
|
// Ensure the transform is safe by checking for the dual
|
|
|
|
// dependencies to make sure we do not induce a loop.
|
|
|
|
|
|
|
|
// As LD is a predecessor to both OP and ST we can do this by checking:
|
|
|
|
// a). if LD is a predecessor to a member of Xn or Yn.
|
|
|
|
// b). if a Zn is a predecessor to ST.
|
|
|
|
|
|
|
|
// However, (b) can only occur through being a chain predecessor to
|
|
|
|
// ST, which is the same as Zn being a member or predecessor of Xn,
|
|
|
|
// which is a subset of LD being a predecessor of Xn. So it's
|
|
|
|
// subsumed by check (a).
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
SDValue Chain = StoreNode->getChain();
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2018-03-20 04:19:46 +08:00
|
|
|
// Gather X elements in ChainOps.
|
2012-04-13 03:14:21 +08:00
|
|
|
if (Chain == Load.getValue(1)) {
|
2018-03-20 04:19:46 +08:00
|
|
|
FoundLoad = true;
|
|
|
|
ChainOps.push_back(Load.getOperand(0));
|
2018-03-10 04:58:07 +08:00
|
|
|
} else if (Chain.getOpcode() == ISD::TokenFactor) {
|
2012-04-13 03:14:21 +08:00
|
|
|
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
|
|
|
|
SDValue Op = Chain.getOperand(i);
|
|
|
|
if (Op == Load.getValue(1)) {
|
2018-03-20 04:19:46 +08:00
|
|
|
FoundLoad = true;
|
2017-02-02 22:39:26 +08:00
|
|
|
// Drop Load, but keep its chain. No cycle check necessary.
|
|
|
|
ChainOps.push_back(Load.getOperand(0));
|
2012-04-13 03:14:21 +08:00
|
|
|
continue;
|
|
|
|
}
|
2018-03-20 04:19:46 +08:00
|
|
|
LoopWorklist.push_back(Op.getNode());
|
2018-03-18 03:24:54 +08:00
|
|
|
ChainOps.push_back(Op);
|
|
|
|
}
|
|
|
|
}
|
2018-03-20 04:19:46 +08:00
|
|
|
|
|
|
|
if (!FoundLoad)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Worklist is currently Xn. Add Yn to worklist.
|
|
|
|
for (SDValue Op : StoredVal->ops())
|
|
|
|
if (Op.getNode() != LoadNode)
|
|
|
|
LoopWorklist.push_back(Op.getNode());
|
|
|
|
|
|
|
|
// Check (a) if Load is a predecessor to Xn + Yn
|
|
|
|
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
|
|
|
|
true))
|
2018-03-10 04:58:07 +08:00
|
|
|
return false;
|
|
|
|
|
2018-03-20 04:19:46 +08:00
|
|
|
InputChain =
|
|
|
|
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
|
2018-03-10 04:58:07 +08:00
|
|
|
return true;
|
2018-03-10 10:16:15 +08:00
|
|
|
}
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2017-08-26 06:50:52 +08:00
|
|
|
// Change a chain of {load; op; store} of the same value into a simple op
|
|
|
|
// through memory of that value, if the uses of the modified value and its
|
|
|
|
// address are suitable.
|
|
|
|
//
|
|
|
|
// The tablegen memory operand pattern is currently not able to match
|
|
|
|
// the case where the EFLAGS on the original operation are used.
|
|
|
|
//
|
|
|
|
// To move this to tablegen, we'll need to improve tablegen to allow flags to
|
|
|
|
// be transferred from a node in the pattern to the result node, probably with
|
|
|
|
// a new keyword. For example, we have this
|
2017-08-25 10:04:03 +08:00
|
|
|
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
|
|
|
|
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
|
|
|
|
// (implicit EFLAGS)]>;
|
|
|
|
// but maybe need something like this
|
|
|
|
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
|
|
|
|
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
|
|
|
|
// (transferrable EFLAGS)]>;
|
|
|
|
//
|
2017-08-26 06:50:52 +08:00
|
|
|
// Until then, we manually fold these and instruction select the operation
|
|
|
|
// here.
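//
// As an illustrative sketch (the C snippet and constants below are
// hypothetical, chosen only for exposition), the fold takes the sequence
// selected for something like `*p += 5;`:
//   movl (%rdi), %eax
//   addl $5, %eax
//   movl %eax, (%rdi)
// and emits the single read-modify-write form instead:
//   addl $5, (%rdi)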
|
2017-08-25 10:04:03 +08:00
|
|
|
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
|
|
|
|
StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
|
|
|
|
SDValue StoredVal = StoreNode->getOperand(1);
|
|
|
|
unsigned Opc = StoredVal->getOpcode();
|
|
|
|
|
2017-08-26 06:50:52 +08:00
|
|
|
  // Before we try to select anything, make sure this is a memory operand size
|
|
|
|
// and opcode we can handle. Note that this must match the code below that
|
|
|
|
// actually lowers the opcodes.
|
2017-08-25 10:06:36 +08:00
|
|
|
EVT MemVT = StoreNode->getMemoryVT();
|
2017-08-26 06:50:52 +08:00
|
|
|
if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
|
|
|
|
MemVT != MVT::i8)
|
2017-08-25 10:06:36 +08:00
|
|
|
return false;
|
2017-08-26 06:50:52 +08:00
|
|
|
switch (Opc) {
|
|
|
|
default:
|
2017-08-25 10:06:36 +08:00
|
|
|
return false;
|
2017-08-26 06:50:52 +08:00
|
|
|
case X86ISD::INC:
|
|
|
|
case X86ISD::DEC:
|
|
|
|
case X86ISD::ADD:
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::ADC:
|
2017-08-26 06:50:52 +08:00
|
|
|
case X86ISD::SUB:
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::SBB:
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::AND:
|
|
|
|
case X86ISD::OR:
|
|
|
|
case X86ISD::XOR:
|
2017-08-26 06:50:52 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-08-25 10:06:36 +08:00
|
|
|
|
2017-08-25 10:04:03 +08:00
|
|
|
LoadSDNode *LoadNode = nullptr;
|
|
|
|
SDValue InputChain;
|
2017-08-25 10:06:36 +08:00
|
|
|
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode,
|
|
|
|
InputChain))
|
2017-08-25 10:04:03 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
SDValue Base, Scale, Index, Disp, Segment;
|
|
|
|
if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment))
|
|
|
|
return false;
|
|
|
|
|
2017-09-08 07:54:24 +08:00
|
|
|
auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
|
2017-09-09 02:23:42 +08:00
|
|
|
unsigned Opc8) {
|
2017-08-26 06:50:52 +08:00
|
|
|
switch (MemVT.getSimpleVT().SimpleTy) {
|
|
|
|
case MVT::i64:
|
|
|
|
return Opc64;
|
|
|
|
case MVT::i32:
|
|
|
|
return Opc32;
|
|
|
|
case MVT::i16:
|
|
|
|
return Opc16;
|
|
|
|
case MVT::i8:
|
|
|
|
return Opc8;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid size!");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
MachineSDNode *Result;
|
|
|
|
switch (Opc) {
|
|
|
|
case X86ISD::INC:
|
|
|
|
case X86ISD::DEC: {
|
2017-09-08 07:54:24 +08:00
|
|
|
unsigned NewOpc =
|
|
|
|
Opc == X86ISD::INC
|
|
|
|
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
|
|
|
|
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
|
2017-08-26 06:50:52 +08:00
|
|
|
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
|
|
|
|
Result =
|
|
|
|
CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case X86ISD::ADD:
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::ADC:
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::SUB:
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::SBB:
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::AND:
|
|
|
|
case X86ISD::OR:
|
|
|
|
case X86ISD::XOR: {
|
2017-09-08 07:54:24 +08:00
|
|
|
auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
|
|
|
|
switch (Opc) {
|
|
|
|
case X86ISD::ADD:
|
|
|
|
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
|
|
|
|
X86::ADD8mr);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::ADC:
|
|
|
|
return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
|
|
|
|
X86::ADC8mr);
|
2017-09-08 07:54:24 +08:00
|
|
|
case X86ISD::SUB:
|
|
|
|
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
|
|
|
|
X86::SUB8mr);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::SBB:
|
|
|
|
return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
|
|
|
|
X86::SBB8mr);
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::AND:
|
|
|
|
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
|
|
|
|
X86::AND8mr);
|
|
|
|
case X86ISD::OR:
|
|
|
|
return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
|
|
|
|
case X86ISD::XOR:
|
|
|
|
return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
|
|
|
|
X86::XOR8mr);
|
2017-09-08 07:54:24 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid opcode!");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
|
|
|
|
switch (Opc) {
|
|
|
|
case X86ISD::ADD:
|
2017-09-09 02:23:42 +08:00
|
|
|
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::ADC:
|
|
|
|
return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
|
2017-09-08 07:54:24 +08:00
|
|
|
case X86ISD::SUB:
|
2017-09-09 02:23:42 +08:00
|
|
|
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::SBB:
|
|
|
|
return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::AND:
|
2017-09-09 02:23:42 +08:00
|
|
|
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::OR:
|
2017-09-09 02:23:42 +08:00
|
|
|
return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::XOR:
|
2017-09-09 02:23:42 +08:00
|
|
|
return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
|
2017-09-08 07:54:24 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid opcode!");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
|
|
|
|
switch (Opc) {
|
|
|
|
case X86ISD::ADD:
|
|
|
|
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
|
|
|
|
X86::ADD8mi);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::ADC:
|
|
|
|
return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
|
|
|
|
X86::ADC8mi);
|
2017-09-08 07:54:24 +08:00
|
|
|
case X86ISD::SUB:
|
|
|
|
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
|
|
|
|
X86::SUB8mi);
|
2018-01-19 23:37:57 +08:00
|
|
|
case X86ISD::SBB:
|
|
|
|
return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
|
|
|
|
X86::SBB8mi);
|
2017-09-08 08:17:12 +08:00
|
|
|
case X86ISD::AND:
|
|
|
|
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
|
|
|
|
X86::AND8mi);
|
|
|
|
case X86ISD::OR:
|
|
|
|
return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
|
|
|
|
X86::OR8mi);
|
|
|
|
case X86ISD::XOR:
|
|
|
|
return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
|
|
|
|
X86::XOR8mi);
|
2017-09-08 07:54:24 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid opcode!");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
unsigned NewOpc = SelectRegOpcode(Opc);
|
|
|
|
SDValue Operand = StoredVal->getOperand(1);
|
|
|
|
|
|
|
|
// See if the operand is a constant that we can fold into an immediate
|
|
|
|
// operand.
|
|
|
|
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
|
|
|
|
auto OperandV = OperandC->getAPIntValue();
|
|
|
|
|
|
|
|
// Check if we can shrink the operand enough to fit in an immediate (or
|
|
|
|
// fit into a smaller immediate) by negating it and switching the
|
|
|
|
// operation.
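    // For example (illustrative constant): an ADD of +128 does not fit in a
    // sign-extended 8-bit immediate, but the equivalent SUB of -128 does, so
    // negating the constant and swapping ADD <-> SUB gives a shorter encoding.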
|
2017-09-08 08:17:12 +08:00
|
|
|
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
|
|
|
|
((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
|
2017-09-08 07:54:24 +08:00
|
|
|
(-OperandV).getMinSignedBits() <= 8) ||
|
|
|
|
(MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
|
|
|
|
(-OperandV).getMinSignedBits() <= 32)) &&
|
|
|
|
hasNoCarryFlagUses(StoredVal.getNode())) {
|
|
|
|
OperandV = -OperandV;
|
|
|
|
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
|
|
|
|
}
|
|
|
|
|
|
|
|
// First try to fit this into an Imm8 operand. If it doesn't fit, then try
|
|
|
|
// the larger immediate operand.
|
|
|
|
if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
|
|
|
|
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
|
|
|
|
NewOpc = SelectImm8Opcode(Opc);
|
|
|
|
} else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
|
|
|
|
(MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) {
|
|
|
|
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
|
|
|
|
NewOpc = SelectImmOpcode(Opc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-01-19 23:37:57 +08:00
|
|
|
if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
|
|
|
|
SDValue CopyTo =
|
|
|
|
CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
|
|
|
|
StoredVal.getOperand(2), SDValue());
|
|
|
|
|
|
|
|
const SDValue Ops[] = {Base, Scale, Index, Disp,
|
|
|
|
Segment, Operand, CopyTo, CopyTo.getValue(1)};
|
|
|
|
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
|
|
|
|
Ops);
|
|
|
|
} else {
|
|
|
|
const SDValue Ops[] = {Base, Scale, Index, Disp,
|
|
|
|
Segment, Operand, InputChain};
|
|
|
|
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
|
|
|
|
Ops);
|
|
|
|
}
|
2017-08-26 06:50:52 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid opcode!");
|
|
|
|
}
|
|
|
|
|
2017-08-25 10:04:03 +08:00
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
|
|
|
|
MemOp[0] = StoreNode->getMemOperand();
|
|
|
|
MemOp[1] = LoadNode->getMemOperand();
|
|
|
|
Result->setMemRefs(MemOp, MemOp + 2);
|
|
|
|
|
2018-03-20 04:19:46 +08:00
|
|
|
// Update Load Chain uses as well.
|
|
|
|
ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
|
2017-08-25 10:04:03 +08:00
|
|
|
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
|
|
|
|
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
|
|
|
|
CurDAG->RemoveDeadNode(Node);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-09-13 01:40:25 +08:00
|
|
|
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
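// For example (illustrative): (X >> 4) & 0xff is a contiguous 8-bit field
// starting at bit 4, which BEXTR can extract in a single instruction using
// the control value (8 << 8) | 4 (field length in bits [15:8], start bit in
// bits [7:0]).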
|
|
|
|
bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
|
|
|
|
MVT NVT = Node->getSimpleValueType(0);
|
|
|
|
SDLoc dl(Node);
|
|
|
|
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
|
|
|
if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Must have a shift right.
|
|
|
|
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Shift can't have additional users.
|
|
|
|
if (!N0->hasOneUse())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Only supported for 32 and 64 bits.
|
|
|
|
if (NVT != MVT::i32 && NVT != MVT::i64)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Shift amount and RHS of and must be constant.
|
|
|
|
ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
|
|
|
|
ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
|
|
|
|
if (!MaskCst || !ShiftCst)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// And RHS must be a mask.
|
|
|
|
uint64_t Mask = MaskCst->getZExtValue();
|
|
|
|
if (!isMask_64(Mask))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
uint64_t Shift = ShiftCst->getZExtValue();
|
|
|
|
uint64_t MaskSize = countPopulation(Mask);
|
|
|
|
|
|
|
|
// Don't interfere with something that can be handled by extracting AH.
|
|
|
|
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
|
|
|
|
if (Shift == 8 && MaskSize == 8)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Make sure we are only using bits that were in the original value, not
|
|
|
|
// shifted in.
|
|
|
|
if (Shift + MaskSize > NVT.getSizeInBits())
|
|
|
|
return false;
|
|
|
|
|
2018-02-13 05:18:11 +08:00
|
|
|
// Create a BEXTR node and run it through selection.
|
|
|
|
SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
|
|
|
|
SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
|
|
|
|
N0->getOperand(0), C);
|
|
|
|
ReplaceNode(Node, New.getNode());
|
|
|
|
SelectCode(New.getNode());
|
2017-09-13 01:40:25 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-04-28 06:15:33 +08:00
|
|
|
// Emit a PCMPISTR(I/M) instruction.
|
|
|
|
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
|
|
|
|
bool MayFoldLoad, const SDLoc &dl,
|
|
|
|
MVT VT, SDNode *Node) {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
SDValue Imm = Node->getOperand(2);
|
|
|
|
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
|
|
|
|
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
|
|
|
|
|
|
|
|
// If there is a load, it will be behind a bitcast. We don't need to check
|
|
|
|
// alignment on this load.
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
|
|
|
if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
|
|
|
|
tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
|
|
|
|
Tmp3, Tmp4)) {
|
|
|
|
SDValue Load = N1.getOperand(0);
|
|
|
|
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
|
|
|
|
Load.getOperand(0) };
|
|
|
|
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
|
|
|
|
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
|
|
|
// Update the chain.
|
|
|
|
ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
|
|
|
|
// Record the mem-refs
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
|
|
|
|
CNode->setMemRefs(MemOp, MemOp + 1);
|
|
|
|
return CNode;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Ops[] = { N0, N1, Imm };
|
|
|
|
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
|
|
|
|
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
|
|
|
|
return CNode;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
|
|
|
|
// to emit a second instruction after this one. This is needed since we have two
|
|
|
|
// copyToReg nodes glued before this and we need to continue that glue through.
|
|
|
|
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
|
|
|
|
bool MayFoldLoad, const SDLoc &dl,
|
|
|
|
MVT VT, SDNode *Node,
|
|
|
|
SDValue &InFlag) {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N2 = Node->getOperand(2);
|
|
|
|
SDValue Imm = Node->getOperand(4);
|
|
|
|
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
|
|
|
|
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
|
|
|
|
|
|
|
|
// If there is a load, it will be behind a bitcast. We don't need to check
|
|
|
|
// alignment on this load.
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
|
|
|
if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
|
|
|
|
tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
|
|
|
|
Tmp3, Tmp4)) {
|
|
|
|
SDValue Load = N2.getOperand(0);
|
|
|
|
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
|
|
|
|
Load.getOperand(0), InFlag };
|
|
|
|
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
|
|
|
|
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
|
|
|
InFlag = SDValue(CNode, 3);
|
|
|
|
// Update the chain.
|
|
|
|
ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
|
|
|
|
// Record the mem-refs
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
|
|
|
|
CNode->setMemRefs(MemOp, MemOp + 1);
|
|
|
|
return CNode;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Ops[] = { N0, N2, Imm, InFlag };
|
|
|
|
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
|
|
|
|
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
|
|
|
|
InFlag = SDValue(CNode, 2);
|
|
|
|
return CNode;
|
|
|
|
}
|
|
|
|
|
2018-01-20 00:37:25 +08:00
|
|
|
/// If the high bits of an 'and' operand are known zero, try setting the
|
|
|
|
/// high bits of an 'and' constant operand to produce a smaller encoding by
|
|
|
|
/// creating a small, sign-extended negative immediate rather than a large
|
|
|
|
/// positive one. This reverses a transform in SimplifyDemandedBits that
|
|
|
|
/// shrinks mask constants by clearing bits. There is also a possibility that
|
|
|
|
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
|
|
|
|
/// case, just replace the 'and'. Return 'true' if the node is replaced.
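///
/// For example (illustrative): if the top 4 bits of a 32-bit operand are
/// known zero, an 'and' with 0x0ffffff0 (which needs a 4-byte immediate) can
/// instead use the mask 0xfffffff0 (-16), which encodes as a sign-extended
/// 8-bit immediate.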
|
|
|
|
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
|
|
|
|
// i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
|
|
|
|
// have immediate operands.
|
|
|
|
MVT VT = And->getSimpleValueType(0);
|
|
|
|
if (VT != MVT::i32 && VT != MVT::i64)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
|
|
|
|
if (!And1C)
|
|
|
|
return false;
|
|
|
|
|
2018-02-06 00:54:07 +08:00
|
|
|
  // Bail out if the mask constant is already negative. It can't shrink any more.
|
|
|
|
// If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
|
|
|
|
// patterns to use a 32-bit and instead of a 64-bit and by relying on the
|
|
|
|
// implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
|
|
|
|
// are negative too.
|
2018-01-20 00:37:25 +08:00
|
|
|
APInt MaskVal = And1C->getAPIntValue();
|
|
|
|
unsigned MaskLZ = MaskVal.countLeadingZeros();
|
2018-02-06 00:54:07 +08:00
|
|
|
if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
|
2018-01-20 00:37:25 +08:00
|
|
|
return false;
|
|
|
|
|
2018-02-06 00:54:07 +08:00
|
|
|
// Don't extend into the upper 32 bits of a 64 bit mask.
|
|
|
|
if (VT == MVT::i64 && MaskLZ >= 32) {
|
|
|
|
MaskLZ -= 32;
|
|
|
|
MaskVal = MaskVal.trunc(32);
|
|
|
|
}
|
|
|
|
|
2018-01-20 00:37:25 +08:00
|
|
|
SDValue And0 = And->getOperand(0);
|
2018-02-06 00:54:07 +08:00
|
|
|
APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
|
2018-01-20 00:37:25 +08:00
|
|
|
APInt NegMaskVal = MaskVal | HighZeros;
|
|
|
|
|
|
|
|
// If a negative constant would not allow a smaller encoding, there's no need
|
|
|
|
// to continue. Only change the constant when we know it's a win.
|
|
|
|
unsigned MinWidth = NegMaskVal.getMinSignedBits();
|
|
|
|
if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
|
|
|
|
return false;
|
|
|
|
|
2018-02-06 00:54:07 +08:00
|
|
|
// Extend masks if we truncated above.
|
|
|
|
if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
|
|
|
|
NegMaskVal = NegMaskVal.zext(64);
|
|
|
|
HighZeros = HighZeros.zext(64);
|
|
|
|
}
|
|
|
|
|
2018-01-20 00:37:25 +08:00
|
|
|
// The variable operand must be all zeros in the top bits to allow using the
|
|
|
|
// new, negative constant as the mask.
|
|
|
|
if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Check if the mask is -1. In that case, this is an unnecessary instruction
|
|
|
|
// that escaped earlier analysis.
|
|
|
|
if (NegMaskVal.isAllOnesValue()) {
|
|
|
|
ReplaceNode(And, And0.getNode());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// A negative mask allows a smaller encoding. Create a new 'and' node.
|
|
|
|
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
|
|
|
|
SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
|
|
|
|
ReplaceNode(And, NewAnd.getNode());
|
|
|
|
SelectCode(NewAnd.getNode());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-05-11 07:55:37 +08:00
|
|
|
void X86DAGToDAGISel::Select(SDNode *Node) {
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT NVT = Node->getSimpleValueType(0);
|
2006-01-07 04:36:21 +08:00
|
|
|
unsigned Opc, MOpc;
|
|
|
|
unsigned Opcode = Node->getOpcode();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(Node);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-07-18 03:10:17 +08:00
|
|
|
if (Node->isMachineOpcode()) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
|
2013-09-22 16:21:56 +08:00
|
|
|
Node->setNodeId(-1);
|
2016-05-11 07:55:37 +08:00
|
|
|
return; // Already selected.
|
2006-02-09 08:37:58 +08:00
|
|
|
}
|
2006-01-12 06:15:18 +08:00
|
|
|
|
2006-01-07 04:36:21 +08:00
|
|
|
switch (Opcode) {
|
2015-08-19 19:35:10 +08:00
|
|
|
default: break;
|
2015-08-20 00:17:08 +08:00
|
|
|
case ISD::BRIND: {
|
|
|
|
if (Subtarget->isTargetNaCl())
|
|
|
|
      // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
|
|
|
|
// leave the instruction alone.
|
|
|
|
break;
|
|
|
|
if (Subtarget->isTarget64BitILP32()) {
|
|
|
|
// Converts a 32-bit register to a 64-bit, zero-extended version of
|
|
|
|
// it. This is needed because x86-64 can do many things, but jmp %r32
|
|
|
|
// ain't one of them.
|
|
|
|
const SDValue &Target = Node->getOperand(1);
|
|
|
|
assert(Target.getSimpleValueType() == llvm::MVT::i32);
|
|
|
|
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
|
|
|
|
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
|
|
|
|
Node->getOperand(0), ZextTarget);
|
2016-05-14 07:26:28 +08:00
|
|
|
ReplaceNode(Node, Brind.getNode());
|
2015-08-20 00:17:08 +08:00
|
|
|
SelectCode(ZextTarget.getNode());
|
|
|
|
SelectCode(Brind.getNode());
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2015-08-20 00:17:08 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
case X86ISD::GlobalBaseReg:
|
2016-05-12 05:13:17 +08:00
|
|
|
ReplaceNode(Node, getGlobalBaseReg());
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
|
2017-09-20 01:19:45 +08:00
|
|
|
case X86ISD::SELECT:
|
2014-11-06 10:25:03 +08:00
|
|
|
case X86ISD::SHRUNKBLEND: {
|
2017-09-20 01:19:45 +08:00
|
|
|
// SHRUNKBLEND selects like a regular VSELECT. Same with X86ISD::SELECT.
|
2014-11-06 10:25:03 +08:00
|
|
|
SDValue VSelect = CurDAG->getNode(
|
|
|
|
ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
|
|
|
|
Node->getOperand(1), Node->getOperand(2));
|
2017-09-09 13:57:19 +08:00
|
|
|
ReplaceNode(Node, VSelect.getNode());
|
2014-11-06 10:25:03 +08:00
|
|
|
SelectCode(VSelect.getNode());
|
|
|
|
// We already called ReplaceUses.
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2014-11-06 10:25:03 +08:00
|
|
|
}
|
2012-07-01 10:55:34 +08:00
|
|
|
|
2015-08-19 19:35:10 +08:00
|
|
|
case ISD::AND:
|
2017-09-13 01:40:25 +08:00
|
|
|
if (matchBEXTRFromAnd(Node))
|
|
|
|
return;
|
2018-01-20 00:37:25 +08:00
|
|
|
if (shrinkAndImmediate(Node))
|
|
|
|
return;
|
2017-09-13 01:40:25 +08:00
|
|
|
|
|
|
|
LLVM_FALLTHROUGH;
|
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::OR:
|
|
|
|
case ISD::XOR: {
|
2017-09-13 01:40:25 +08:00
|
|
|
|
2011-04-22 23:30:40 +08:00
|
|
|
// For operations of the form (x << C1) op C2, check if we can use a smaller
|
|
|
|
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
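    // For example (illustrative): (x << 8) | 0x0f00 can be rewritten as
    // ((x | 0x0f) << 8), letting the OR use an 8-bit immediate instead of a
    // 32-bit one.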
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
|
|
|
if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
|
|
|
|
break;
|
|
|
|
|
|
|
|
// i8 is unshrinkable, i16 should be promoted to i32.
|
|
|
|
if (NVT != MVT::i32 && NVT != MVT::i64)
|
|
|
|
break;
|
|
|
|
|
|
|
|
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
|
|
|
|
ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
|
|
|
|
if (!Cst || !ShlCst)
|
|
|
|
break;
|
|
|
|
|
|
|
|
int64_t Val = Cst->getSExtValue();
|
|
|
|
uint64_t ShlVal = ShlCst->getZExtValue();
|
|
|
|
|
|
|
|
// Make sure that we don't change the operation by removing bits.
|
|
|
|
    // This only matters for OR and XOR; AND is unaffected.
|
2012-08-25 07:29:28 +08:00
|
|
|
uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
|
|
|
|
if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
|
2011-04-22 23:30:40 +08:00
|
|
|
break;
|
|
|
|
|
2015-04-02 03:01:09 +08:00
|
|
|
unsigned ShlOp, AddOp, Op;
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT CstVT = NVT;
|
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
// Check the minimum bitwidth for the new constant.
|
|
|
|
// TODO: AND32ri is the same as AND64ri32 with zext imm.
|
|
|
|
// TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
|
|
|
|
// TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
|
|
|
|
if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
|
|
|
|
CstVT = MVT::i8;
|
|
|
|
else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
|
|
|
|
CstVT = MVT::i32;
|
|
|
|
|
|
|
|
// Bail if there is no smaller encoding.
|
|
|
|
if (NVT == CstVT)
|
|
|
|
break;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2011-04-22 23:30:40 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
|
|
|
case MVT::i32:
|
|
|
|
assert(CstVT == MVT::i8);
|
|
|
|
ShlOp = X86::SHL32ri;
|
2015-04-02 03:01:09 +08:00
|
|
|
AddOp = X86::ADD32rr;
|
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
switch (Opcode) {
|
2012-08-12 01:44:14 +08:00
|
|
|
default: llvm_unreachable("Impossible opcode");
|
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::AND: Op = X86::AND32ri8; break;
|
|
|
|
case ISD::OR: Op = X86::OR32ri8; break;
|
|
|
|
case ISD::XOR: Op = X86::XOR32ri8; break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
assert(CstVT == MVT::i8 || CstVT == MVT::i32);
|
|
|
|
ShlOp = X86::SHL64ri;
|
2015-04-02 03:01:09 +08:00
|
|
|
AddOp = X86::ADD64rr;
|
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
switch (Opcode) {
|
2012-08-12 01:44:14 +08:00
|
|
|
default: llvm_unreachable("Impossible opcode");
|
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
|
|
|
|
case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
|
|
|
|
case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the smaller op and the shift.
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
|
2011-04-22 23:30:40 +08:00
|
|
|
SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
|
2015-04-02 03:01:09 +08:00
|
|
|
if (ShlVal == 1)
|
2016-05-11 07:55:37 +08:00
|
|
|
CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
|
|
|
|
SDValue(New, 0));
|
|
|
|
else
|
|
|
|
CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
|
|
|
|
getI8Imm(ShlVal, dl));
|
|
|
|
return;
|
2011-04-22 23:30:40 +08:00
|
|
|
}
|
2014-10-24 05:55:31 +08:00
|
|
|
case X86ISD::UMUL8:
|
|
|
|
case X86ISD::SMUL8: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
|
|
|
Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
|
|
|
|
|
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
|
|
|
|
N0, SDValue()).getValue(1);
|
|
|
|
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
|
|
|
|
SDValue Ops[] = {N1, InFlag};
|
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
|
|
|
|
2016-05-12 05:13:17 +08:00
|
|
|
ReplaceNode(Node, CNode);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2014-10-24 05:55:31 +08:00
|
|
|
}
|
|
|
|
|
2010-12-05 15:30:36 +08:00
|
|
|
case X86ISD::UMUL: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2011-01-15 06:34:13 +08:00
|
|
|
unsigned LoReg;
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2010-12-05 15:30:36 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2017-09-29 00:56:36 +08:00
|
|
|
// MVT::i8 is handled by X86ISD::UMUL8.
|
2011-01-15 06:34:13 +08:00
|
|
|
case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
|
|
|
|
case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
|
|
|
|
case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
|
2010-12-05 15:30:36 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-12-05 15:30:36 +08:00
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
|
|
|
|
N0, SDValue()).getValue(1);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-12-05 15:30:36 +08:00
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
|
|
|
|
SDValue Ops[] = {N1, InFlag};
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2016-05-12 06:21:50 +08:00
|
|
|
ReplaceNode(Node, CNode);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2010-12-05 15:30:36 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
case ISD::SMUL_LOHI:
|
|
|
|
case ISD::UMUL_LOHI: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2006-01-07 04:36:21 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
bool isSigned = Opcode == ISD::SMUL_LOHI;
|
2012-09-26 16:22:37 +08:00
|
|
|
bool hasBMI2 = Subtarget->hasBMI2();
|
2009-08-08 05:33:25 +08:00
|
|
|
if (!isSigned) {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2012-09-26 16:22:37 +08:00
|
|
|
case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
|
|
|
|
MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
|
|
|
|
case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
|
|
|
|
MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
|
2006-01-07 04:36:21 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
} else {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
|
|
|
|
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2012-09-26 16:22:37 +08:00
|
|
|
unsigned SrcReg, LoReg, HiReg;
|
|
|
|
switch (Opc) {
|
|
|
|
default: llvm_unreachable("Unknown MUL opcode!");
|
|
|
|
case X86::IMUL32r:
|
|
|
|
case X86::MUL32r:
|
|
|
|
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
|
|
|
|
break;
|
|
|
|
case X86::IMUL64r:
|
|
|
|
case X86::MUL64r:
|
|
|
|
SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
|
|
|
|
break;
|
|
|
|
case X86::MULX32rr:
|
|
|
|
SrcReg = X86::EDX; LoReg = HiReg = 0;
|
|
|
|
break;
|
|
|
|
case X86::MULX64rr:
|
|
|
|
SrcReg = X86::RDX; LoReg = HiReg = 0;
|
|
|
|
break;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2006-01-07 04:36:21 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
2015-10-14 00:23:00 +08:00
|
|
|
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-08 05:33:25 +08:00
|
|
|
    // Multiply is commutative.
|
2009-08-03 00:10:52 +08:00
|
|
|
if (!foldedLoad) {
|
2015-10-14 00:23:00 +08:00
|
|
|
foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-03 00:10:52 +08:00
|
|
|
if (foldedLoad)
|
|
|
|
std::swap(N0, N1);
|
|
|
|
}
|
|
|
|
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
|
2012-05-23 13:44:51 +08:00
|
|
|
N0, SDValue()).getValue(1);
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue ResHi, ResLo;
|
2009-08-03 00:10:52 +08:00
|
|
|
|
|
|
|
if (foldedLoad) {
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue Chain;
|
2016-06-24 05:40:35 +08:00
|
|
|
MachineSDNode *CNode = nullptr;
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
|
|
|
|
InFlag };
|
2012-09-26 16:22:37 +08:00
|
|
|
if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
|
2016-06-24 05:40:35 +08:00
|
|
|
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
ResHi = SDValue(CNode, 0);
|
|
|
|
ResLo = SDValue(CNode, 1);
|
|
|
|
Chain = SDValue(CNode, 2);
|
|
|
|
InFlag = SDValue(CNode, 3);
|
|
|
|
} else {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
|
2016-06-24 05:40:35 +08:00
|
|
|
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
Chain = SDValue(CNode, 0);
|
|
|
|
InFlag = SDValue(CNode, 1);
|
|
|
|
}
|
2010-12-05 15:30:36 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
// Update the chain.
|
2012-09-26 16:22:37 +08:00
|
|
|
ReplaceUses(N1.getValue(1), Chain);
|
2016-06-24 05:40:35 +08:00
|
|
|
// Record the mem-refs
|
2017-11-09 06:26:37 +08:00
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
|
|
|
|
CNode->setMemRefs(MemOp, MemOp + 1);
|
2009-08-03 00:10:52 +08:00
|
|
|
} else {
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue Ops[] = { N1, InFlag };
|
|
|
|
if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
ResHi = SDValue(CNode, 0);
|
|
|
|
ResLo = SDValue(CNode, 1);
|
|
|
|
InFlag = SDValue(CNode, 2);
|
|
|
|
} else {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
InFlag = SDValue(CNode, 0);
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Copy the low half of the result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 0).use_empty()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!ResLo.getNode()) {
|
2012-09-26 16:22:37 +08:00
|
|
|
assert(LoReg && "Register for low half is not defined!");
|
|
|
|
ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
|
|
|
|
InFlag);
|
|
|
|
InFlag = ResLo.getValue(2);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 0), ResLo);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
|
|
|
|
dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
// Copy the high half of the result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 1).use_empty()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!ResHi.getNode()) {
|
2012-09-26 16:22:37 +08:00
|
|
|
assert(HiReg && "Register for high half is not defined!");
|
|
|
|
ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
|
|
|
|
InFlag);
|
|
|
|
InFlag = ResHi.getValue(2);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 1), ResHi);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
|
|
|
|
dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2017-09-09 13:57:20 +08:00
|
|
|
CurDAG->RemoveDeadNode(Node);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
case ISD::SDIVREM:
|
2014-11-04 04:26:35 +08:00
|
|
|
case ISD::UDIVREM:
|
|
|
|
case X86ISD::SDIVREM8_SEXT_HREG:
|
|
|
|
case X86ISD::UDIVREM8_ZEXT_HREG: {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2014-11-04 04:26:35 +08:00
|
|
|
bool isSigned = (Opcode == ISD::SDIVREM ||
|
|
|
|
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
|
2009-08-08 05:33:25 +08:00
|
|
|
if (!isSigned) {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
|
|
|
|
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
|
|
|
|
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
|
|
|
|
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
} else {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
|
|
|
|
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
|
|
|
|
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
|
|
|
|
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2009-12-23 09:45:04 +08:00
|
|
|
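// DIV/IDIV take the dividend in AX / DX:AX / EDX:EAX / RDX:RAX and return
// the quotient in the low register and the remainder in the high one (AH for
// the 8-bit form). Pick those registers, the register to clear for the
// unsigned case, and the sign-extension opcode for the signed case.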
unsigned LoReg, HiReg, ClrReg;
|
2013-05-30 21:19:42 +08:00
|
|
|
unsigned SExtOpcode;
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CBW;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i16:
|
2009-08-03 00:10:52 +08:00
|
|
|
LoReg = X86::AX; HiReg = X86::DX;
|
2013-05-30 21:19:42 +08:00
|
|
|
ClrReg = X86::DX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CWD;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i32:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CDQ;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i64:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CQO;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
2015-10-14 00:23:00 +08:00
|
|
|
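// Try to fold the divisor load into the memory form of the divide, and check
// whether the dividend's sign bit is known zero so the sign extension below
// can be skipped.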
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-03 00:10:52 +08:00
|
|
|
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
|
|
|
|
|
|
|
|
SDValue InFlag;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
|
2009-08-03 00:10:52 +08:00
|
|
|
// Special case for div8, just use a move with zero extension to AX to
|
|
|
|
// clear the upper 8 bits (AH).
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
|
|
|
|
Move =
|
2011-05-21 03:04:40 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
|
2013-04-20 06:22:57 +08:00
|
|
|
MVT::Other, Ops), 0);
|
2009-08-03 00:10:52 +08:00
|
|
|
Chain = Move.getValue(1);
|
|
|
|
ReplaceUses(N0.getValue(1), Chain);
|
2006-11-18 06:10:14 +08:00
|
|
|
} else {
|
2009-08-03 00:10:52 +08:00
|
|
|
Move =
|
2011-05-21 03:04:40 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
|
2009-08-03 00:10:52 +08:00
|
|
|
Chain = CurDAG->getEntryNode();
|
|
|
|
}
|
2011-05-21 03:04:40 +08:00
|
|
|
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
|
2009-08-03 00:10:52 +08:00
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
} else {
|
|
|
|
InFlag =
|
|
|
|
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
|
|
|
|
LoReg, N0, SDValue()).getValue(1);
|
|
|
|
if (isSigned && !signBitIsZero) {
|
|
|
|
// Sign extend the low part into the high part.
|
2006-11-18 06:10:14 +08:00
|
|
|
InFlag =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
|
2009-08-03 00:10:52 +08:00
|
|
|
} else {
|
|
|
|
// Zero out the high part, effectively zero extending the input.
|
2014-12-04 13:20:33 +08:00
|
|
|
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
|
2013-08-15 13:57:07 +08:00
|
|
|
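// MOV32r0 defines a 32-bit zero; narrow it with EXTRACT_SUBREG for i16 or
// widen it with SUBREG_TO_REG for i64 so it can be copied into ClrReg.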
switch (NVT.SimpleTy) {
|
2013-05-30 21:19:42 +08:00
|
|
|
case MVT::i16:
|
|
|
|
ClrNode =
|
|
|
|
SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_16bit, dl,
|
|
|
|
MVT::i32)),
|
2013-05-30 21:19:42 +08:00
|
|
|
0);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
ClrNode =
|
|
|
|
SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
|
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, dl,
|
|
|
|
MVT::i32)),
|
2013-05-30 21:19:42 +08:00
|
|
|
0);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unexpected division source");
|
|
|
|
}
|
|
|
|
|
2009-12-23 09:45:04 +08:00
|
|
|
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
|
2009-08-03 00:10:52 +08:00
|
|
|
ClrNode, InFlag).getValue(1);
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
if (foldedLoad) {
|
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
|
|
|
|
InFlag };
|
2017-11-09 06:26:39 +08:00
|
|
|
MachineSDNode *CNode =
|
2013-04-20 06:22:57 +08:00
|
|
|
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
|
2009-08-03 00:10:52 +08:00
|
|
|
InFlag = SDValue(CNode, 1);
|
|
|
|
// Update the chain.
|
|
|
|
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
|
2017-11-09 06:26:39 +08:00
|
|
|
// Record the mem-refs
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
|
|
|
|
CNode->setMemRefs(MemOp, MemOp + 1);
|
2009-08-03 00:10:52 +08:00
|
|
|
} else {
|
|
|
|
InFlag =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
2014-11-04 04:26:35 +08:00
|
|
|
// Prevent use of AH in a REX instruction by explicitly copying it to
|
|
|
|
// an ABCD_L register.
|
2013-07-09 10:07:28 +08:00
|
|
|
//
|
|
|
|
// The current assumption of the register allocator is that isel
|
2014-11-04 04:26:35 +08:00
|
|
|
// won't generate explicit references to the GR8_ABCD_H registers. If
|
2013-07-09 10:07:28 +08:00
|
|
|
// the allocator and/or the backend get enhanced to be more robust in
|
|
|
|
// that regard, this can be, and should be, removed.
|
2014-11-04 04:26:35 +08:00
|
|
|
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
|
|
|
|
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
|
|
|
|
unsigned AHExtOpcode =
|
2018-03-20 13:00:20 +08:00
|
|
|
isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
|
2014-11-04 04:26:35 +08:00
|
|
|
|
|
|
|
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
|
|
|
|
MVT::Glue, AHCopy, InFlag);
|
|
|
|
SDValue Result(RNode, 0);
|
|
|
|
InFlag = SDValue(RNode, 1);
|
|
|
|
|
|
|
|
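// For the divrem8 *_HREG nodes the extended i32 value is itself the result;
// otherwise extract the low byte of the extending copy.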
if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
|
|
|
|
Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
|
2017-10-27 05:12:03 +08:00
|
|
|
assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!");
|
2014-11-04 04:26:35 +08:00
|
|
|
} else {
|
|
|
|
Result =
|
|
|
|
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 1), Result);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
|
|
|
|
dbgs() << '\n');
|
2010-06-26 08:39:23 +08:00
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
// Copy the division (low) result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 0).use_empty()) {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
|
|
|
LoReg, NVT, InFlag);
|
|
|
|
InFlag = Result.getValue(2);
|
2010-01-05 09:24:18 +08:00
|
|
|
ReplaceUses(SDValue(Node, 0), Result);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
|
|
|
|
dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
// Copy the remainder (high) result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 1).use_empty()) {
|
2010-06-26 08:39:23 +08:00
|
|
|
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
|
|
|
HiReg, NVT, InFlag);
|
|
|
|
InFlag = Result.getValue(2);
|
2010-01-05 09:24:18 +08:00
|
|
|
ReplaceUses(SDValue(Node, 1), Result);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
|
|
|
|
dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2017-09-09 13:57:20 +08:00
|
|
|
CurDAG->RemoveDeadNode(Node);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
2018-02-12 11:02:02 +08:00
|
|
|
case X86ISD::CMP: {
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
2010-08-05 06:40:58 +08:00
|
|
|
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))
|
2010-04-28 16:30:49 +08:00
|
|
|
N0 = N0.getOperand(0);
|
2015-02-12 16:40:34 +08:00
|
|
|
|
2014-08-18 19:59:06 +08:00
|
|
|
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
|
|
|
|
// use a smaller encoding.
|
|
|
|
// Look past the truncate if CMP is the only use of it.
|
2018-02-12 11:02:01 +08:00
|
|
|
if (N0.getOpcode() == ISD::AND &&
|
2011-11-04 05:49:52 +08:00
|
|
|
N0.getNode()->hasOneUse() &&
|
2009-08-20 02:16:17 +08:00
|
|
|
N0.getValueType() != MVT::i8 &&
|
|
|
|
X86::isZeroNode(N1)) {
|
2017-05-12 21:08:45 +08:00
|
|
|
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
|
2009-08-20 02:16:17 +08:00
|
|
|
if (!C) break;
|
2017-08-25 13:04:34 +08:00
|
|
|
uint64_t Mask = C->getZExtValue();
|
2009-08-20 02:16:17 +08:00
|
|
|
|
2018-02-01 03:20:06 +08:00
|
|
|
MVT VT;
|
|
|
|
int SubRegOp;
|
|
|
|
unsigned Op;
|
|
|
|
|
2017-08-25 13:04:34 +08:00
|
|
|
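// Pick the narrowest TEST encoding that still covers the mask, as long as
// the narrower form's sign bit is clear or no user of the flags does a
// signed comparison.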
if (isUInt<8>(Mask) &&
|
|
|
|
(!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
|
2018-02-01 03:20:06 +08:00
|
|
|
// For example, convert "testl %eax, $8" to "testb %al, $8"
|
|
|
|
VT = MVT::i8;
|
|
|
|
SubRegOp = X86::sub_8bit;
|
|
|
|
Op = X86::TEST8ri;
|
|
|
|
} else if (OptForMinSize && isUInt<16>(Mask) &&
|
|
|
|
(!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
|
|
|
|
// For example, "testl %eax, $32776" to "testw %ax, $32776".
|
|
|
|
// NOTE: We only want to form TESTW instructions if optimizing for
|
|
|
|
// min size. Otherwise we only save one byte and possibly get a length
|
|
|
|
// changing prefix penalty in the decoders.
|
|
|
|
VT = MVT::i16;
|
|
|
|
SubRegOp = X86::sub_16bit;
|
|
|
|
Op = X86::TEST16ri;
|
|
|
|
} else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
|
|
|
|
(!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
|
|
|
|
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
|
|
|
|
// NOTE: We only want to run that transform if N0 is 32 or 64 bits.
|
|
|
|
// Otherwise, we find ourselves in a position where we have to do
|
|
|
|
// promotion. If previous passes did not promote the and, we assume
|
|
|
|
// they had a good reason not to and do not promote here.
|
|
|
|
VT = MVT::i32;
|
|
|
|
SubRegOp = X86::sub_32bit;
|
|
|
|
Op = X86::TEST32ri;
|
|
|
|
} else {
|
|
|
|
// No eligible transformation was found.
|
|
|
|
break;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
|
2018-02-01 03:20:06 +08:00
|
|
|
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
|
|
|
|
SDValue Reg = N0.getOperand(0);
|
2018-01-30 22:18:33 +08:00
|
|
|
|
2018-02-01 03:20:06 +08:00
|
|
|
// Extract the subregister if necessary.
|
|
|
|
if (N0.getValueType() != VT)
|
|
|
|
Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
|
|
|
|
|
|
|
|
// Emit the narrowed TEST instruction (testb, testw, or testl).
|
|
|
|
SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
|
2018-02-12 11:02:02 +08:00
|
|
|
// Replace CMP with TEST.
|
2018-03-20 04:19:46 +08:00
|
|
|
ReplaceNode(Node, NewNode);
|
2018-02-01 03:20:06 +08:00
|
|
|
return;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2018-04-28 06:15:33 +08:00
|
|
|
case X86ISD::PCMPISTR: {
|
|
|
|
if (!Subtarget->hasSSE42())
|
|
|
|
break;
|
|
|
|
|
|
|
|
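// PCMPISTRI returns the index in ECX and PCMPISTRM returns the mask in XMM0,
// so emit one or both instructions depending on which results of the node
// are actually used.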
bool NeedIndex = !SDValue(Node, 0).use_empty();
|
|
|
|
bool NeedMask = !SDValue(Node, 1).use_empty();
|
|
|
|
// We can't fold a load if we are going to make two instructions.
|
|
|
|
bool MayFoldLoad = !NeedIndex || !NeedMask;
|
|
|
|
|
|
|
|
MachineSDNode *CNode;
|
|
|
|
if (NeedMask) {
|
|
|
|
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
|
|
|
|
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
|
|
|
|
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
|
|
|
|
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
|
|
|
|
}
|
|
|
|
if (NeedIndex || !NeedMask) {
|
|
|
|
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
|
|
|
|
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
|
|
|
|
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
|
|
|
|
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Connect the flag usage to the last instruction created.
|
|
|
|
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
|
|
|
|
CurDAG->RemoveDeadNode(Node);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case X86ISD::PCMPESTR: {
|
|
|
|
if (!Subtarget->hasSSE42())
|
|
|
|
break;
|
|
|
|
|
|
|
|
// Copy the two implicit register inputs.
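// PCMPESTRI/PCMPESTRM take the explicit string lengths in EAX and EDX.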
|
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
|
|
|
|
Node->getOperand(1),
|
|
|
|
SDValue()).getValue(1);
|
|
|
|
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
|
|
|
|
Node->getOperand(3), InFlag).getValue(1);
|
|
|
|
|
|
|
|
bool NeedIndex = !SDValue(Node, 0).use_empty();
|
|
|
|
bool NeedMask = !SDValue(Node, 1).use_empty();
|
|
|
|
// We can't fold a load if we are going to make two instructions.
|
|
|
|
bool MayFoldLoad = !NeedIndex || !NeedMask;
|
|
|
|
|
|
|
|
MachineSDNode *CNode;
|
|
|
|
if (NeedMask) {
|
|
|
|
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
|
|
|
|
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
|
|
|
|
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
|
|
|
|
InFlag);
|
|
|
|
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
|
|
|
|
}
|
|
|
|
if (NeedIndex || !NeedMask) {
|
|
|
|
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
|
|
|
|
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
|
|
|
|
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
|
|
|
|
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
|
|
|
|
}
|
|
|
|
// Connect the flag usage to the last instruction created.
|
|
|
|
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
|
|
|
|
CurDAG->RemoveDeadNode(Node);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-08-25 10:04:03 +08:00
|
|
|
case ISD::STORE:
|
|
|
|
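// Try to merge a load / op / store of the same address into a single
// memory-operand (read-modify-write) instruction.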
if (foldLoadStoreIntoMemOperand(Node))
|
|
|
|
return;
|
|
|
|
break;
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|
|
|
|
|
2016-05-11 07:55:37 +08:00
|
|
|
SelectCode(Node);
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|
|
|
|
|
2006-06-09 02:03:49 +08:00
|
|
|
bool X86DAGToDAGISel::
|
2015-03-13 20:45:09 +08:00
|
|
|
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
|
2008-08-23 10:25:05 +08:00
|
|
|
std::vector<SDValue> &OutOps) {
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue Op0, Op1, Op2, Op3, Op4;
|
2015-03-13 20:45:09 +08:00
|
|
|
switch (ConstraintID) {
|
2015-05-16 20:09:54 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Unexpected asm memory constraint");
|
|
|
|
case InlineAsm::Constraint_i:
|
|
|
|
// FIXME: It seems strange that 'i' is needed here since it's supposed to
|
|
|
|
// be an immediate and not a memory constraint.
|
2016-08-17 13:10:15 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
2015-03-13 20:45:09 +08:00
|
|
|
case InlineAsm::Constraint_o: // offsetable ??
|
|
|
|
case InlineAsm::Constraint_v: // not offsetable ??
|
|
|
|
case InlineAsm::Constraint_m: // memory
|
2015-05-16 20:09:54 +08:00
|
|
|
case InlineAsm::Constraint_X:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
|
2006-06-09 02:03:49 +08:00
|
|
|
return true;
|
|
|
|
break;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2006-08-26 09:05:16 +08:00
|
|
|
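// An x86 memory operand always expands to the five-operand form:
// base, scale, index, displacement and segment.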
OutOps.push_back(Op0);
|
|
|
|
OutOps.push_back(Op1);
|
|
|
|
OutOps.push_back(Op2);
|
|
|
|
OutOps.push_back(Op3);
|
2009-04-09 05:14:34 +08:00
|
|
|
OutOps.push_back(Op4);
|
2006-06-09 02:03:49 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// This pass converts a legalized DAG into a X86-specific DAG,
|
|
|
|
/// ready for instruction scheduling.
|
2009-04-30 07:29:43 +08:00
|
|
|
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
|
2012-03-27 15:21:54 +08:00
|
|
|
CodeGenOpt::Level OptLevel) {
|
2009-04-29 08:15:41 +08:00
|
|
|
return new X86DAGToDAGISel(TM, OptLevel);
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|