//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "x86-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"

static cl::opt<bool>
NoFusing("disable-spill-fusing",
         cl::desc("Disable fusing of spill code into instructions"));
static cl::opt<bool>
PrintFailedFusing("print-failed-fuse-candidates",
                  cl::desc("Print instructions that the allocator wants to"
                           " fuse, but the X86 backend currently can't"),
                  cl::Hidden);
static cl::opt<bool>
ReMatPICStubLoad("remat-pic-stub-load",
                 cl::desc("Re-materialize load from stub in PIC mode"),
                 cl::init(false), cl::Hidden);

enum {
  // Select which memory operand is being unfolded.
  // (stored in bits 0 - 3)
  TB_INDEX_0    = 0,
  TB_INDEX_1    = 1,
  TB_INDEX_2    = 2,
  TB_INDEX_3    = 3,
  TB_INDEX_4    = 4,
  TB_INDEX_MASK = 0xf,

  // Do not insert the reverse map (MemOp -> RegOp) into the table.
  // This may be needed because there is a many -> one mapping.
  TB_NO_REVERSE   = 1 << 4,

  // Do not insert the forward map (RegOp -> MemOp) into the table.
  // This is needed for Native Client, which prohibits branch
  // instructions from using a memory operand.
  TB_NO_FORWARD   = 1 << 5,

  TB_FOLDED_LOAD  = 1 << 6,
  TB_FOLDED_STORE = 1 << 7,

  // Minimum alignment required for load/store.
  // Used for RegOp->MemOp conversion.
  // (stored in bits 8 - 15)
  TB_ALIGN_SHIFT = 8,
  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
};
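
// For example, a Flags value of (TB_INDEX_2 | TB_FOLDED_LOAD | TB_ALIGN_16)
// marks an entry whose memory operand is unfolded at operand index 2, whose
// memory form folds a load, and whose memory operand must be 16-byte aligned;
// with the alignment byte stored in bits 8 - 15 this encodes as 0x1042.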

struct X86MemoryFoldTableEntry {
  uint16_t RegOp;
  uint16_t MemOp;
  uint16_t Flags;
};

// Pin the vtable to this file.
void X86InstrInfo::anchor() {}

X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
    : X86GenInstrInfo(
          (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
          (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
      Subtarget(STI), RI(STI.getTargetTriple()) {
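  // Each table below pairs a register-form opcode (RegOp) with its memory-form
  // equivalent (MemOp) plus TB_* flags; AddTableEntry registers each pair in
  // the forward (RegOp -> MemOp) and reverse (MemOp -> RegOp) folding maps.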

  static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
    { X86::ADC32ri, X86::ADC32mi, 0 },
    { X86::ADC32ri8, X86::ADC32mi8, 0 },
    { X86::ADC32rr, X86::ADC32mr, 0 },
    { X86::ADC64ri32, X86::ADC64mi32, 0 },
    { X86::ADC64ri8, X86::ADC64mi8, 0 },
    { X86::ADC64rr, X86::ADC64mr, 0 },
    { X86::ADD16ri, X86::ADD16mi, 0 },
    { X86::ADD16ri8, X86::ADD16mi8, 0 },
    { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
    { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
    { X86::ADD16rr, X86::ADD16mr, 0 },
    { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
    { X86::ADD32ri, X86::ADD32mi, 0 },
    { X86::ADD32ri8, X86::ADD32mi8, 0 },
    { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
    { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
    { X86::ADD32rr, X86::ADD32mr, 0 },
    { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
    { X86::ADD64ri32, X86::ADD64mi32, 0 },
    { X86::ADD64ri8, X86::ADD64mi8, 0 },
    { X86::ADD64ri32_DB, X86::ADD64mi32, TB_NO_REVERSE },
    { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
    { X86::ADD64rr, X86::ADD64mr, 0 },
    { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
    { X86::ADD8ri, X86::ADD8mi, 0 },
    { X86::ADD8rr, X86::ADD8mr, 0 },
    { X86::AND16ri, X86::AND16mi, 0 },
    { X86::AND16ri8, X86::AND16mi8, 0 },
    { X86::AND16rr, X86::AND16mr, 0 },
    { X86::AND32ri, X86::AND32mi, 0 },
    { X86::AND32ri8, X86::AND32mi8, 0 },
    { X86::AND32rr, X86::AND32mr, 0 },
    { X86::AND64ri32, X86::AND64mi32, 0 },
    { X86::AND64ri8, X86::AND64mi8, 0 },
    { X86::AND64rr, X86::AND64mr, 0 },
    { X86::AND8ri, X86::AND8mi, 0 },
    { X86::AND8rr, X86::AND8mr, 0 },
    { X86::DEC16r, X86::DEC16m, 0 },
    { X86::DEC32r, X86::DEC32m, 0 },
    { X86::DEC64r, X86::DEC64m, 0 },
    { X86::DEC8r, X86::DEC8m, 0 },
    { X86::INC16r, X86::INC16m, 0 },
    { X86::INC32r, X86::INC32m, 0 },
    { X86::INC64r, X86::INC64m, 0 },
    { X86::INC8r, X86::INC8m, 0 },
    { X86::NEG16r, X86::NEG16m, 0 },
    { X86::NEG32r, X86::NEG32m, 0 },
    { X86::NEG64r, X86::NEG64m, 0 },
    { X86::NEG8r, X86::NEG8m, 0 },
    { X86::NOT16r, X86::NOT16m, 0 },
    { X86::NOT32r, X86::NOT32m, 0 },
    { X86::NOT64r, X86::NOT64m, 0 },
    { X86::NOT8r, X86::NOT8m, 0 },
    { X86::OR16ri, X86::OR16mi, 0 },
    { X86::OR16ri8, X86::OR16mi8, 0 },
    { X86::OR16rr, X86::OR16mr, 0 },
    { X86::OR32ri, X86::OR32mi, 0 },
    { X86::OR32ri8, X86::OR32mi8, 0 },
    { X86::OR32rr, X86::OR32mr, 0 },
    { X86::OR64ri32, X86::OR64mi32, 0 },
    { X86::OR64ri8, X86::OR64mi8, 0 },
    { X86::OR64rr, X86::OR64mr, 0 },
    { X86::OR8ri, X86::OR8mi, 0 },
    { X86::OR8rr, X86::OR8mr, 0 },
    { X86::ROL16r1, X86::ROL16m1, 0 },
    { X86::ROL16rCL, X86::ROL16mCL, 0 },
    { X86::ROL16ri, X86::ROL16mi, 0 },
    { X86::ROL32r1, X86::ROL32m1, 0 },
    { X86::ROL32rCL, X86::ROL32mCL, 0 },
    { X86::ROL32ri, X86::ROL32mi, 0 },
    { X86::ROL64r1, X86::ROL64m1, 0 },
    { X86::ROL64rCL, X86::ROL64mCL, 0 },
    { X86::ROL64ri, X86::ROL64mi, 0 },
    { X86::ROL8r1, X86::ROL8m1, 0 },
    { X86::ROL8rCL, X86::ROL8mCL, 0 },
    { X86::ROL8ri, X86::ROL8mi, 0 },
    { X86::ROR16r1, X86::ROR16m1, 0 },
    { X86::ROR16rCL, X86::ROR16mCL, 0 },
    { X86::ROR16ri, X86::ROR16mi, 0 },
    { X86::ROR32r1, X86::ROR32m1, 0 },
    { X86::ROR32rCL, X86::ROR32mCL, 0 },
    { X86::ROR32ri, X86::ROR32mi, 0 },
    { X86::ROR64r1, X86::ROR64m1, 0 },
    { X86::ROR64rCL, X86::ROR64mCL, 0 },
    { X86::ROR64ri, X86::ROR64mi, 0 },
    { X86::ROR8r1, X86::ROR8m1, 0 },
    { X86::ROR8rCL, X86::ROR8mCL, 0 },
    { X86::ROR8ri, X86::ROR8mi, 0 },
    { X86::SAR16r1, X86::SAR16m1, 0 },
    { X86::SAR16rCL, X86::SAR16mCL, 0 },
    { X86::SAR16ri, X86::SAR16mi, 0 },
    { X86::SAR32r1, X86::SAR32m1, 0 },
    { X86::SAR32rCL, X86::SAR32mCL, 0 },
    { X86::SAR32ri, X86::SAR32mi, 0 },
    { X86::SAR64r1, X86::SAR64m1, 0 },
    { X86::SAR64rCL, X86::SAR64mCL, 0 },
    { X86::SAR64ri, X86::SAR64mi, 0 },
    { X86::SAR8r1, X86::SAR8m1, 0 },
    { X86::SAR8rCL, X86::SAR8mCL, 0 },
    { X86::SAR8ri, X86::SAR8mi, 0 },
    { X86::SBB32ri, X86::SBB32mi, 0 },
    { X86::SBB32ri8, X86::SBB32mi8, 0 },
    { X86::SBB32rr, X86::SBB32mr, 0 },
    { X86::SBB64ri32, X86::SBB64mi32, 0 },
    { X86::SBB64ri8, X86::SBB64mi8, 0 },
    { X86::SBB64rr, X86::SBB64mr, 0 },
    { X86::SHL16rCL, X86::SHL16mCL, 0 },
    { X86::SHL16ri, X86::SHL16mi, 0 },
    { X86::SHL32rCL, X86::SHL32mCL, 0 },
    { X86::SHL32ri, X86::SHL32mi, 0 },
    { X86::SHL64rCL, X86::SHL64mCL, 0 },
    { X86::SHL64ri, X86::SHL64mi, 0 },
    { X86::SHL8rCL, X86::SHL8mCL, 0 },
    { X86::SHL8ri, X86::SHL8mi, 0 },
    { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
    { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
    { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
    { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
    { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
    { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
    { X86::SHR16r1, X86::SHR16m1, 0 },
    { X86::SHR16rCL, X86::SHR16mCL, 0 },
    { X86::SHR16ri, X86::SHR16mi, 0 },
    { X86::SHR32r1, X86::SHR32m1, 0 },
    { X86::SHR32rCL, X86::SHR32mCL, 0 },
    { X86::SHR32ri, X86::SHR32mi, 0 },
    { X86::SHR64r1, X86::SHR64m1, 0 },
    { X86::SHR64rCL, X86::SHR64mCL, 0 },
    { X86::SHR64ri, X86::SHR64mi, 0 },
    { X86::SHR8r1, X86::SHR8m1, 0 },
    { X86::SHR8rCL, X86::SHR8mCL, 0 },
    { X86::SHR8ri, X86::SHR8mi, 0 },
    { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
    { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
    { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
    { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
    { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
    { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
    { X86::SUB16ri, X86::SUB16mi, 0 },
    { X86::SUB16ri8, X86::SUB16mi8, 0 },
    { X86::SUB16rr, X86::SUB16mr, 0 },
    { X86::SUB32ri, X86::SUB32mi, 0 },
    { X86::SUB32ri8, X86::SUB32mi8, 0 },
    { X86::SUB32rr, X86::SUB32mr, 0 },
    { X86::SUB64ri32, X86::SUB64mi32, 0 },
    { X86::SUB64ri8, X86::SUB64mi8, 0 },
    { X86::SUB64rr, X86::SUB64mr, 0 },
    { X86::SUB8ri, X86::SUB8mi, 0 },
    { X86::SUB8rr, X86::SUB8mr, 0 },
    { X86::XOR16ri, X86::XOR16mi, 0 },
    { X86::XOR16ri8, X86::XOR16mi8, 0 },
    { X86::XOR16rr, X86::XOR16mr, 0 },
    { X86::XOR32ri, X86::XOR32mi, 0 },
    { X86::XOR32ri8, X86::XOR32mi8, 0 },
    { X86::XOR32rr, X86::XOR32mr, 0 },
    { X86::XOR64ri32, X86::XOR64mi32, 0 },
    { X86::XOR64ri8, X86::XOR64mi8, 0 },
    { X86::XOR64rr, X86::XOR64mr, 0 },
    { X86::XOR8ri, X86::XOR8mi, 0 },
    { X86::XOR8rr, X86::XOR8mr, 0 }
  };

  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) {
    unsigned RegOp = MemoryFoldTable2Addr[i].RegOp;
    unsigned MemOp = MemoryFoldTable2Addr[i].MemOp;
    unsigned Flags = MemoryFoldTable2Addr[i].Flags;
    AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
                  RegOp, MemOp,
                  // Index 0, folded load and store, no alignment requirement.
                  Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
  }
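
  // Every two-address fold both loads and stores the memory operand, hence the
  // TB_FOLDED_LOAD | TB_FOLDED_STORE added above. The *_DB entries are marked
  // TB_NO_REVERSE because several register forms share one memory form, so a
  // MemOp -> RegOp unfolding entry for them would be ambiguous.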

  static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
    { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
    { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
    { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
    { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
    { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
    { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
    { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
    { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
    { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
    { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
    { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
    { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
    { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
    { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
    { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
    { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
    { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
    { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
    { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
    { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
    { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
    { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
    { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
    { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
    { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
    { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
    { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
    { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
    { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
    { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
    { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
    { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
    { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
    { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
    { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
    { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
    { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
    { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
    { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
    { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
    { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE },
    { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
    { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
    { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
    { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
    { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
    { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
    { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
    { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
    { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
    { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
    { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
    { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
    { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
    { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
    { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
    { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
    { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
    { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
    { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
    { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
    { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
    { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
    { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
    { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
    { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
    { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
    { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
    { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
    { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
    { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
    { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
    { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
    { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },

    // AVX 128-bit versions of foldable instructions
    { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
    { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
    { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE },
    { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE },
    { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
    { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
    { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
    { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
    { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },

    // AVX 256-bit foldable instructions
    { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
    { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },

    // AVX-512 foldable instructions
    { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
    { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
    { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
    { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
    { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
    { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
    { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
    { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
    { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
    { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
    { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },

    // AVX-512 foldable instructions (256-bit versions)
    { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
    { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
    { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
    { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
    { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
    { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
    { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },

    // AVX-512 foldable instructions (128-bit versions)
    { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
    { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
    { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
    { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
    { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
    { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
    { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },

    // F16C foldable instructions
    { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
    { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
  };

  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) {
    unsigned RegOp = MemoryFoldTable0[i].RegOp;
    unsigned MemOp = MemoryFoldTable0[i].MemOp;
    unsigned Flags = MemoryFoldTable0[i].Flags;
    AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
                  RegOp, MemOp, TB_INDEX_0 | Flags);
  }
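
  // Table0 entries fold at operand index 0; each row carries TB_FOLDED_LOAD or
  // TB_FOLDED_STORE to say whether the memory form reads (e.g. CMP32mi) or
  // writes (e.g. MOV32mr, SETEm) that operand.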
|
|
|
|
|
2015-02-18 06:38:06 +08:00
|
|
|
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::CMP16rr, X86::CMP16rm, 0 },
|
|
|
|
{ X86::CMP32rr, X86::CMP32rm, 0 },
|
|
|
|
{ X86::CMP64rr, X86::CMP64rm, 0 },
|
|
|
|
{ X86::CMP8rr, X86::CMP8rm, 0 },
|
|
|
|
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
|
|
|
|
{ X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
|
|
|
|
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
|
|
|
|
{ X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
|
|
|
|
{ X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
|
|
|
|
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
|
|
|
|
{ X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
|
|
|
|
{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
|
|
|
|
{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
|
|
|
|
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
|
|
|
|
{ X86::IMUL16rri, X86::IMUL16rmi, 0 },
|
|
|
|
{ X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
|
|
|
|
{ X86::IMUL32rri, X86::IMUL32rmi, 0 },
|
|
|
|
{ X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
|
|
|
|
{ X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
|
|
|
|
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
|
|
|
|
{ X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
|
|
|
|
{ X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
|
|
|
|
{ X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
|
|
|
|
{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
|
2012-06-15 15:02:58 +08:00
|
|
|
{ X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
|
|
|
|
{ X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
|
[X86][AVX] Missing AVX1 memory folding float instructions
Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing).
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 },
|
2014-11-06 06:28:25 +08:00
|
|
|
{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
|
2014-11-07 06:15:41 +08:00
|
|
|
{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
|
2014-12-17 06:30:10 +08:00
|
|
|
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
|
2014-11-07 06:15:41 +08:00
|
|
|
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
|
[X86][AVX] Missing AVX1 memory folding float instructions
Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing).
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 },
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
|
|
|
|
{ X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
|
|
|
|
{ X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
|
|
|
|
{ X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
|
|
|
|
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
|
|
|
|
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
|
|
|
|
{ X86::MOV16rr, X86::MOV16rm, 0 },
|
|
|
|
{ X86::MOV32rr, X86::MOV32rm, 0 },
|
|
|
|
{ X86::MOV64rr, X86::MOV64rm, 0 },
|
|
|
|
{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
|
|
|
|
{ X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
|
|
|
|
{ X86::MOV8rr, X86::MOV8rm, 0 },
|
|
|
|
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
|
|
|
|
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
|
|
|
|
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
|
|
|
|
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
|
|
|
|
{ X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
|
|
|
|
{ X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
|
|
|
|
{ X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
|
|
|
|
{ X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
|
|
|
|
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
|
|
|
|
{ X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
|
|
|
|
{ X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
|
|
|
|
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
|
|
|
|
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
|
|
|
|
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
|
|
|
|
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
|
|
|
|
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
|
2011-11-14 16:07:55 +08:00
|
|
|
{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
|
|
|
|
{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
|
|
|
|
{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
|
2015-01-22 07:43:30 +08:00
|
|
|
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
|
|
|
|
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
|
|
|
|
{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
|
|
|
|
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
|
|
|
|
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
|
2015-01-22 07:43:30 +08:00
|
|
|
{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
|
[x86] eliminate unnecessary shuffling/moves with unary scalar math ops (PR21507)
Finish the job that was abandoned in D6958 following the refactoring in
http://reviews.llvm.org/rL230221:
1. Uncomment the intrinsic def for the AVX r_Int instruction.
2. Add missing r_Int entries to the load folding tables; there are already
tests that check these in "test/Codegen/X86/fold-load-unops.ll", so I
haven't added any more in this patch.
3. Add patterns to solve PR21507 ( https://llvm.org/bugs/show_bug.cgi?id=21507 ).
So instead of this:
movaps %xmm0, %xmm1
rcpss %xmm1, %xmm1
movss %xmm1, %xmm0
We should now get:
rcpss %xmm0, %xmm0
And instead of this:
vsqrtss %xmm0, %xmm0, %xmm1
vblendps $1, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm1[0],xmm0[1,2,3]
We should now get:
vsqrtss %xmm0, %xmm0, %xmm0
Differential Revision: http://reviews.llvm.org/D9504
llvm-svn: 236740
2015-05-07 23:48:53 +08:00
|
|
|
{ X86::RCPSSr, X86::RCPSSm, 0 },
|
|
|
|
{ X86::RCPSSr_Int, X86::RCPSSm_Int, 0 },
|
[X86][AVX] Missing AVX1 memory folding float instructions
Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing).
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
|
|
|
|
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
|
|
|
|
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
|
|
|
|
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
|
|
|
|
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
|
|
|
|
{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
|
|
|
|
{ X86::SQRTSDr, X86::SQRTSDm, 0 },
|
|
|
|
{ X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
|
|
|
|
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
|
|
|
|
{ X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
|
|
|
|
{ X86::TEST16rr, X86::TEST16rm, 0 },
|
|
|
|
{ X86::TEST32rr, X86::TEST32rm, 0 },
|
|
|
|
{ X86::TEST64rr, X86::TEST64rm, 0 },
|
|
|
|
{ X86::TEST8rr, X86::TEST8rm, 0 },
|
2008-01-07 09:35:02 +08:00
|
|
|
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::UCOMISDrr, X86::UCOMISDrm, 0 },
|
|
|
|
{ X86::UCOMISSrr, X86::UCOMISSrm, 0 },
|
2015-02-10 21:22:57 +08:00
|
|
|
|
2015-02-25 23:14:02 +08:00
|
|
|
// MMX version of foldable instructions
|
|
|
|
{ X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
|
|
|
|
{ X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
|
|
|
|
{ X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
|
|
|
|
{ X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
|
|
|
|
{ X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
|
|
|
|
{ X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
|
|
|
|
{ X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
|
|
|
|
{ X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
|
|
|
|
{ X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
|
|
|
|
{ X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
|
|
|
|
|
2015-04-03 19:50:30 +08:00
|
|
|
// 3DNow! version of foldable instructions
|
|
|
|
{ X86::PF2IDrr, X86::PF2IDrm, 0 },
|
|
|
|
{ X86::PF2IWrr, X86::PF2IWrm, 0 },
|
|
|
|
{ X86::PFRCPrr, X86::PFRCPrm, 0 },
|
|
|
|
{ X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
|
|
|
|
{ X86::PI2FDrr, X86::PI2FDrm, 0 },
|
|
|
|
{ X86::PI2FWrr, X86::PI2FWrm, 0 },
|
|
|
|
{ X86::PSWAPDrr, X86::PSWAPDrm, 0 },
|
|
|
|
|
2011-09-14 10:36:58 +08:00
|
|
|
// AVX 128-bit versions of foldable instructions
|
|
|
|
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
|
|
|
|
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
|
|
|
|
{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
|
|
|
|
{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
|
2012-06-15 15:02:58 +08:00
|
|
|
{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
|
|
|
|
{ X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
|
2012-06-15 06:12:58 +08:00
|
|
|
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
|
2012-06-15 15:02:58 +08:00
|
|
|
{ X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
|
|
|
|
{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
|
|
|
|
{ X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
|
|
|
|
{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
|
|
|
|
{ X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
|
|
|
|
{ X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
|
|
|
|
{ X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
|
|
|
|
{ X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
|
|
|
|
{ X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
|
[X86][AVX] Missing AVX1 memory folding float instructions
Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing).
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 },
|
2014-11-06 06:28:25 +08:00
|
|
|
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
|
2014-11-07 06:15:41 +08:00
|
|
|
{ X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 },
|
2014-12-17 06:30:10 +08:00
|
|
|
{ X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 },
|
2014-11-07 06:15:41 +08:00
|
|
|
{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
|
[X86][AVX] Missing AVX1 memory folding float instructions
Now that we can create much more exhaustive X86 memory folding tests, this patch adds the missing AVX1/F16C floating point instruction stack foldings we can easily test for including the scalar intrinsics (add, div, max, min, mul, sub), conversions float/int to double, half precision conversions, rounding, dot product and bit test. The patch also adds a couple of obviously missing SSE instructions (more to follow once we have full SSE testing).
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 },
|
2014-11-07 06:15:41 +08:00
|
|
|
{ X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
|
|
|
|
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
|
2011-09-14 10:36:58 +08:00
|
|
|
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
|
|
|
|
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
|
|
|
|
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
|
|
|
|
{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
|
|
|
|
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
|
|
|
|
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
|
|
|
|
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
|
2015-01-23 06:39:59 +08:00
|
|
|
{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
|
|
|
|
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
|
2012-12-26 10:14:19 +08:00
|
|
|
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
|
2011-09-14 10:36:58 +08:00
|
|
|
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
|
|
|
|
{ X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
|
|
|
|
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
|
2012-12-26 10:44:47 +08:00
|
|
|
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
|
|
|
|
{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },
|
|
|
|
{ X86::VPABSWrr128, X86::VPABSWrm128, 0 },
|
2015-01-22 07:43:30 +08:00
|
|
|
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
|
|
|
|
{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
|
|
|
|
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
|
|
|
|
{ X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
|
|
|
|
{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
|
2012-12-26 10:44:47 +08:00
|
|
|
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
|
|
|
|
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
|
2015-01-22 07:43:30 +08:00
|
|
|
{ X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
|
|
|
|
{ X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
|
|
|
|
{ X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
|
|
|
|
{ X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
|
|
    { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
    { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
    { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
    { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
    { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
    { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
    { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
    { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
    { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
    { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
    { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
    { X86::VPTESTrr, X86::VPTESTrm, 0 },
    { X86::VRCPPSr, X86::VRCPPSm, 0 },
    { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
    { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
    { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
    { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
    { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
    { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
    { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
    { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
    { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },

    // AVX 256-bit foldable instructions
    { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
    { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
    { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
    { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
    { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
    { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
    { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
    { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
    { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
    { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
    { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
    { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
    { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
    { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
    { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
    { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
    { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
    { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
    { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
    { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
    { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
    { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
    { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
    { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
    { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
    { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
    { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },

    // AVX2 foldable instructions
    // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
    // VBROADCASTS{SD}rm memory instructions were available from AVX1.
    // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
    // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
    // so they don't need an equivalent limitation.
    { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
    { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
    { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
    { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
    { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 },
    { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 },
    { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 },
    { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 },
    { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 },
    { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 },
    { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 },
    { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 },
    { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
    { X86::VPERMQYri, X86::VPERMQYmi, 0 },
    { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 },
    { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 },
    { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
    { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
    { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
    { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 },
    { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 },
    { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 },
    { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
    { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
    { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
    { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 },
    { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
    { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
    { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },

    // XOP foldable instructions
    { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
    { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
    { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
    { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
    { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
    { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
    { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
    { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
    { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
    { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
    { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
    { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
    { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
    { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
    { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
    { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
    { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
    { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
    { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
    { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
    { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
    { X86::VPROTBri, X86::VPROTBmi, 0 },
    { X86::VPROTBrr, X86::VPROTBmr, 0 },
    { X86::VPROTDri, X86::VPROTDmi, 0 },
    { X86::VPROTDrr, X86::VPROTDmr, 0 },
    { X86::VPROTQri, X86::VPROTQmi, 0 },
    { X86::VPROTQrr, X86::VPROTQmr, 0 },
    { X86::VPROTWri, X86::VPROTWmi, 0 },
    { X86::VPROTWrr, X86::VPROTWmr, 0 },
    { X86::VPSHABrr, X86::VPSHABmr, 0 },
    { X86::VPSHADrr, X86::VPSHADmr, 0 },
    { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
    { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
    { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
    { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
    { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
    { X86::VPSHLWrr, X86::VPSHLWmr, 0 },

    // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
    { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
    { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
    { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
    { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
    { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
    { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
    { X86::BLCI32rr, X86::BLCI32rm, 0 },
    { X86::BLCI64rr, X86::BLCI64rm, 0 },
    { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
    { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
    { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
    { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
    { X86::BLCS32rr, X86::BLCS32rm, 0 },
    { X86::BLCS64rr, X86::BLCS64rm, 0 },
    { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
    { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
    { X86::BLSI32rr, X86::BLSI32rm, 0 },
    { X86::BLSI64rr, X86::BLSI64rm, 0 },
    { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
    { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
    { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
    { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
    { X86::BLSR32rr, X86::BLSR32rm, 0 },
    { X86::BLSR64rr, X86::BLSR64rm, 0 },
    { X86::BZHI32rr, X86::BZHI32rm, 0 },
    { X86::BZHI64rr, X86::BZHI64rm, 0 },
    { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
    { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
    { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
    { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
    { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
    { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
    { X86::RORX32ri, X86::RORX32mi, 0 },
    { X86::RORX64ri, X86::RORX64mi, 0 },
    { X86::SARX32rr, X86::SARX32rm, 0 },
    { X86::SARX64rr, X86::SARX64rm, 0 },
    { X86::SHRX32rr, X86::SHRX32rm, 0 },
    { X86::SHRX64rr, X86::SHRX64rm, 0 },
    { X86::SHLX32rr, X86::SHLX32rm, 0 },
    { X86::SHLX64rr, X86::SHLX64rm, 0 },
    { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
    { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
    { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
    { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
    { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
    { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
    { X86::TZMSK64rr, X86::TZMSK64rm, 0 },

    // AVX-512 foldable instructions
    { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
    { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
    { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
    { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
    { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
    { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
    { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
    { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
    { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
    { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
    { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
    { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
    { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
    { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
    { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
    { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },

    // AVX-512 foldable instructions (256-bit versions)
    { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
    { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
    { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
    { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
    { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
    { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
    { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
    { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
    { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
    { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
    { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
    { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },

    // AVX-512 foldable instructions (128-bit versions)
    { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
    { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
    { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
    { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
    { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
    { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
    { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
    { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
    { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
    { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
    { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },

    // F16C foldable instructions
    { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
    { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },

    // AES foldable instructions
    { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
    { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
    { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
  };
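
  // Register each MemoryFoldTable1 pair in the forward (reg->mem) and reverse
  // (mem->reg) maps; entries flagged TB_NO_FORWARD or TB_NO_REVERSE skip the
  // corresponding direction. The load is folded into operand index 1.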
  for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) {
    unsigned RegOp = MemoryFoldTable1[i].RegOp;
    unsigned MemOp = MemoryFoldTable1[i].MemOp;
    unsigned Flags = MemoryFoldTable1[i].Flags;
    AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
                  RegOp, MemOp,
                  // Index 1, folded load
                  Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
  }
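
  // Folding table for two-register-operand instructions: each entry pairs a
  // register form with the form that takes its second source from memory
  // (e.g. ADD32rr -> ADD32rm), plus any TB_* alignment/direction flags.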
  static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
    { X86::ADC32rr, X86::ADC32rm, 0 },
    { X86::ADC64rr, X86::ADC64rm, 0 },
    { X86::ADD16rr, X86::ADD16rm, 0 },
    { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
    { X86::ADD32rr, X86::ADD32rm, 0 },
    { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
    { X86::ADD64rr, X86::ADD64rm, 0 },
    { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
    { X86::ADD8rr, X86::ADD8rm, 0 },
    { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
    { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
    { X86::ADDSDrr, X86::ADDSDrm, 0 },
    { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 },
    { X86::ADDSSrr, X86::ADDSSrm, 0 },
    { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 },
    { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
    { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
    { X86::AND16rr, X86::AND16rm, 0 },
    { X86::AND32rr, X86::AND32rm, 0 },
    { X86::AND64rr, X86::AND64rm, 0 },
    { X86::AND8rr, X86::AND8rm, 0 },
    { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
    { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
    { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
    { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
    { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
    { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
    { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
    { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
    { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
    { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
    { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
    { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
    { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
    { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
    { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
    { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
    { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
    { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
    { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
    { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
    { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
    { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
    { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
    { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
    { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
    { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
    { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
    { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
    { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
    { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
    { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
    { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
    { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
    { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
    { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
    { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
    { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
    { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
    { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
    { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
    { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
    { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
    { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
    { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
    { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
    { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
    { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
    { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
    { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
    { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
    { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
    { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
    { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
    { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
    { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
    { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
    { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
    { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
    { X86::CMPSDrr, X86::CMPSDrm, 0 },
    { X86::CMPSSrr, X86::CMPSSrm, 0 },
    { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
    { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
    { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
    { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
    { X86::DIVSDrr, X86::DIVSDrm, 0 },
    { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 },
    { X86::DIVSSrr, X86::DIVSSrm, 0 },
    { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
    { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
    { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },

    // FIXME: We should not be folding Fs* scalar loads into vector
    // instructions because the vector instructions require vector-sized
    // loads. Lowering should create vector-sized instructions (the Fv*
    // variants below) to allow load folding.
    { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
    { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
    { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
    { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
    { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
    { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
    { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
    { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },

    { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
    { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
    { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
    { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
    { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
    { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
    { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
    { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
    { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
    { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
    { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
    { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
    { X86::IMUL16rr, X86::IMUL16rm, 0 },
    { X86::IMUL32rr, X86::IMUL32rm, 0 },
    { X86::IMUL64rr, X86::IMUL64rm, 0 },
    { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
    { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
    { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
    { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
    { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
    { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
    { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
    { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
    { X86::MAXSDrr, X86::MAXSDrm, 0 },
    { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
    { X86::MAXSSrr, X86::MAXSSrm, 0 },
    { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
    { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
    { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
    { X86::MINSDrr, X86::MINSDrm, 0 },
    { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
    { X86::MINSSrr, X86::MINSSrm, 0 },
    { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
    { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
    { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
    { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
    { X86::MULSDrr, X86::MULSDrm, 0 },
    { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
    { X86::MULSSrr, X86::MULSSrm, 0 },
    { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
    { X86::OR16rr, X86::OR16rm, 0 },
    { X86::OR32rr, X86::OR32rm, 0 },
    { X86::OR64rr, X86::OR64rm, 0 },
    { X86::OR8rr, X86::OR8rm, 0 },
    { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
    { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
    { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
    { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
    { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
    { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
    { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
    { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
    { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
    { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
    { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
    { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
    { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
    { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
    { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },
    { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
    { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
    { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
    { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
    { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
    { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
    { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
    { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
    { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
    { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
    { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
    { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
    { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
    { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
    { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
    { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
    { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
    { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
    { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
    { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
    { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
    { X86::PINSRBrr, X86::PINSRBrm, 0 },
    { X86::PINSRDrr, X86::PINSRDrm, 0 },
    { X86::PINSRQrr, X86::PINSRQrm, 0 },
    { X86::PINSRWrri, X86::PINSRWrmi, 0 },
    { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
    { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
    { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
    { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
    { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
    { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
    { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
    { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
    { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
    { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
    { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
    { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
    { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
    { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
    { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
    { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
    { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
    { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
    { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
    { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
    { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
    { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
    { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
    { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
    { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
    { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
    { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
    { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
    { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
    { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
    { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
    { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
    { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
    { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
    { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
    { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
    { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
    { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
    { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
    { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
    { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
    { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
    { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
    { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
    { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
    { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
    { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
    { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
    { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
    { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
    { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
    { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
    { X86::SBB32rr, X86::SBB32rm, 0 },
    { X86::SBB64rr, X86::SBB64rm, 0 },
    { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
    { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
    { X86::SUB16rr, X86::SUB16rm, 0 },
    { X86::SUB32rr, X86::SUB32rm, 0 },
    { X86::SUB64rr, X86::SUB64rm, 0 },
    { X86::SUB8rr, X86::SUB8rm, 0 },
    { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
    { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
    { X86::SUBSDrr, X86::SUBSDrm, 0 },
    { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 },
    { X86::SUBSSrr, X86::SUBSSrm, 0 },
Now that scalar folding is working it broke a very old test (2006-10-07-ScalarSSEMiscompile.ll) - this test appears to make no sense as its trying to ensure that a scalar subtraction isn't folded as it 'would zero the top elts of the loaded vector' - this test just appears to be wrong to me.
Differential Revision: http://reviews.llvm.org/D7055
llvm-svn: 226513
2015-01-20 06:40:45 +08:00
|
|
|
{ X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 },
|
2008-01-07 09:35:02 +08:00
|
|
|
// FIXME: TEST*rr -> swapped operand of TEST*mr.
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
|
|
|
|
{ X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
|
|
|
|
{ X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
|
|
|
|
{ X86::XOR16rr, X86::XOR16rm, 0 },
|
|
|
|
{ X86::XOR32rr, X86::XOR32rm, 0 },
|
|
|
|
{ X86::XOR64rr, X86::XOR64rm, 0 },
|
|
|
|
{ X86::XOR8rr, X86::XOR8rm, 0 },
|
|
|
|
{ X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
|
2011-09-14 10:36:58 +08:00
|
|
|
{ X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
|
2015-02-10 20:57:17 +08:00
|
|
|
|
2015-02-25 23:14:02 +08:00
|
|
|
// MMX version of foldable instructions
{ X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
{ X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
{ X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
{ X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
{ X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
{ X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
{ X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
{ X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
{ X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
{ X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
{ X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
{ X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
{ X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
{ X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
{ X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
{ X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
{ X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
{ X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
{ X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
{ X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
{ X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
{ X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
{ X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
{ X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
{ X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
{ X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
{ X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
{ X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
{ X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
{ X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
{ X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
{ X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
{ X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
{ X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
{ X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
{ X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
{ X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
{ X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
{ X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
{ X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
{ X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
{ X86::MMX_PORirr, X86::MMX_PORirm, 0 },
{ X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
{ X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
{ X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
{ X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
{ X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
{ X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
{ X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
{ X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
{ X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
{ X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
{ X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
{ X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
{ X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
{ X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
{ X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
{ X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
{ X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
{ X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
{ X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
{ X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
{ X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
{ X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
{ X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
{ X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
{ X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
{ X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },

// 3DNow! version of foldable instructions
{ X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
{ X86::PFACCrr, X86::PFACCrm, 0 },
{ X86::PFADDrr, X86::PFADDrm, 0 },
{ X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
{ X86::PFCMPGErr, X86::PFCMPGErm, 0 },
{ X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
{ X86::PFMAXrr, X86::PFMAXrm, 0 },
{ X86::PFMINrr, X86::PFMINrm, 0 },
{ X86::PFMULrr, X86::PFMULrm, 0 },
{ X86::PFNACCrr, X86::PFNACCrm, 0 },
{ X86::PFPNACCrr, X86::PFPNACCrm, 0 },
{ X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
{ X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
{ X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
{ X86::PFSUBrr, X86::PFSUBrm, 0 },
{ X86::PFSUBRrr, X86::PFSUBRrm, 0 },
{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },

// AVX 128-bit versions of foldable instructions
{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
{ X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
{ X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
{ X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
{ X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
{ X86::VRCPSSr, X86::VRCPSSm, 0 },
{ X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
{ X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
{ X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
{ X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 },
{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
{ X86::VANDNPSrr, X86::VANDNPSrm, 0 },
{ X86::VANDPDrr, X86::VANDPDrm, 0 },
{ X86::VANDPSrr, X86::VANDPSrm, 0 },
{ X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
{ X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
{ X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
{ X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
{ X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
{ X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
{ X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
{ X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
// Do not fold VFs* loads because there are no scalar load variants for
// these instructions. When folded, the load is required to be 128-bits, so
// the load size would not match.
{ X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
{ X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
{ X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
{ X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
{ X86::VFvORPDrr, X86::VFvORPDrm, 0 },
{ X86::VFvORPSrr, X86::VFvORPSrm, 0 },
{ X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
{ X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
{ X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
{ X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
{ X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
{ X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
{ X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
{ X86::VORPDrr, X86::VORPDrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
{ X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
{ X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
{ X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
{ X86::VPADDBrr, X86::VPADDBrm, 0 },
{ X86::VPADDDrr, X86::VPADDDrm, 0 },
{ X86::VPADDQrr, X86::VPADDQrm, 0 },
{ X86::VPADDSBrr, X86::VPADDSBrm, 0 },
{ X86::VPADDSWrr, X86::VPADDSWrm, 0 },
{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
{ X86::VPADDWrr, X86::VPADDWrm, 0 },
{ X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
{ X86::VPANDNrr, X86::VPANDNrm, 0 },
{ X86::VPANDrr, X86::VPANDrm, 0 },
{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },
{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
{ X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
{ X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
{ X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
{ X86::VPHADDDrr, X86::VPHADDDrm, 0 },
{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
{ X86::VPHADDWrr, X86::VPHADDWrm, 0 },
{ X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
{ X86::VPINSRBrr, X86::VPINSRBrm, 0 },
{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
{ X86::VPMINSWrr, X86::VPMINSWrm, 0 },
{ X86::VPMINUBrr, X86::VPMINUBrm, 0 },
{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
{ X86::VPMINSDrr, X86::VPMINSDrm, 0 },
{ X86::VPMINUDrr, X86::VPMINUDrm, 0 },
{ X86::VPMINUWrr, X86::VPMINUWrm, 0 },
{ X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
{ X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
{ X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
{ X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
{ X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
{ X86::VPMULHWrr, X86::VPMULHWrm, 0 },
{ X86::VPMULLDrr, X86::VPMULLDrm, 0 },
{ X86::VPMULLWrr, X86::VPMULLWrm, 0 },
{ X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
{ X86::VPORrr, X86::VPORrm, 0 },
{ X86::VPSADBWrr, X86::VPSADBWrm, 0 },
{ X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
{ X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
{ X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
{ X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
{ X86::VPSLLDrr, X86::VPSLLDrm, 0 },
{ X86::VPSLLQrr, X86::VPSLLQrm, 0 },
{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },
{ X86::VPSRADrr, X86::VPSRADrm, 0 },
{ X86::VPSRAWrr, X86::VPSRAWrm, 0 },
{ X86::VPSRLDrr, X86::VPSRLDrm, 0 },
{ X86::VPSRLQrr, X86::VPSRLQrm, 0 },
{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },
{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },
{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },
{ X86::VPSUBQrr, X86::VPSUBQrm, 0 },
{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
{ X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
{ X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },
{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
{ X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
{ X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
{ X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
{ X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
{ X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
{ X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 },
{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
{ X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
{ X86::VXORPDrr, X86::VXORPDrm, 0 },
{ X86::VXORPSrr, X86::VXORPSrm, 0 },

// AVX 256-bit foldable instructions
{ X86::VADDPDYrr, X86::VADDPDYrm, 0 },
{ X86::VADDPSYrr, X86::VADDPSYrm, 0 },
{ X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
{ X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
{ X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
{ X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
{ X86::VANDPDYrr, X86::VANDPDYrm, 0 },
{ X86::VANDPSYrr, X86::VANDPSYrm, 0 },
{ X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
{ X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
{ X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
{ X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
{ X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
{ X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
{ X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
{ X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
{ X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
{ X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
{ X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
{ X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
{ X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
{ X86::VMULPDYrr, X86::VMULPDYrm, 0 },
{ X86::VMULPSYrr, X86::VMULPSYrm, 0 },
{ X86::VORPDYrr, X86::VORPDYrm, 0 },
{ X86::VORPSYrr, X86::VORPSYrm, 0 },
{ X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
{ X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
{ X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
{ X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
{ X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
{ X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
{ X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
{ X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
{ X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
{ X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
{ X86::VXORPDYrr, X86::VXORPDYrm, 0 },
{ X86::VXORPSYrr, X86::VXORPSYrm, 0 },

// AVX2 foldable instructions
{ X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
{ X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
{ X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
{ X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
{ X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
{ X86::VPADDBYrr, X86::VPADDBYrm, 0 },
{ X86::VPADDDYrr, X86::VPADDDYrm, 0 },
{ X86::VPADDQYrr, X86::VPADDQYrm, 0 },
{ X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
{ X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
{ X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
{ X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
{ X86::VPADDWYrr, X86::VPADDWYrm, 0 },
{ X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 },
{ X86::VPANDNYrr, X86::VPANDNYrm, 0 },
{ X86::VPANDYrr, X86::VPANDYrm, 0 },
{ X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
{ X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
{ X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
{ X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
{ X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
{ X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
{ X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
{ X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
{ X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
{ X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
{ X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
{ X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
{ X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
{ X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
{ X86::VPERMDYrr, X86::VPERMDYrm, 0 },
{ X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
{ X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
{ X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
{ X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
{ X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
{ X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
{ X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
{ X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
{ X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
{ X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
{ X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
{ X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
{ X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
{ X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
{ X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
{ X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
{ X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
{ X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
{ X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
{ X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
{ X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
{ X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
{ X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
{ X86::VPORYrr, X86::VPORYrm, 0 },
{ X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
{ X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
{ X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
{ X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
{ X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
{ X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
{ X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
{ X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
{ X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
{ X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
{ X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
{ X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
{ X86::VPSRADYrr, X86::VPSRADYrm, 0 },
{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
{ X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
{ X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
{ X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
{ X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
{ X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
{ X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
{ X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
{ X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
{ X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
{ X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
{ X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
{ X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
{ X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
{ X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
{ X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
{ X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
{ X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
{ X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
{ X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
{ X86::VPXORYrr, X86::VPXORYrm, 0 },

// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
{ X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 },
{ X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
{ X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 },
{ X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
{ X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 },
{ X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
{ X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 },
{ X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
{ X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 },
{ X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
{ X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 },
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 },

// XOP foldable instructions
{ X86::VPCMOVrr, X86::VPCMOVmr, 0 },
{ X86::VPCMOVrrY, X86::VPCMOVmrY, 0 },
{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
{ X86::VPCOMWri, X86::VPCOMWmi, 0 },
{ X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
{ X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
{ X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
{ X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
{ X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
{ X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
{ X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
{ X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
{ X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
{ X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
{ X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
{ X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
{ X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
{ X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
{ X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
{ X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
{ X86::VPPERMrr, X86::VPPERMmr, 0 },
{ X86::VPROTBrr, X86::VPROTBrm, 0 },
{ X86::VPROTDrr, X86::VPROTDrm, 0 },
{ X86::VPROTQrr, X86::VPROTQrm, 0 },
{ X86::VPROTWrr, X86::VPROTWrm, 0 },
{ X86::VPSHABrr, X86::VPSHABrm, 0 },
{ X86::VPSHADrr, X86::VPSHADrm, 0 },
{ X86::VPSHAQrr, X86::VPSHAQrm, 0 },
{ X86::VPSHAWrr, X86::VPSHAWrm, 0 },
{ X86::VPSHLBrr, X86::VPSHLBrm, 0 },
{ X86::VPSHLDrr, X86::VPSHLDrm, 0 },
{ X86::VPSHLQrr, X86::VPSHLQrm, 0 },
{ X86::VPSHLWrr, X86::VPSHLWrm, 0 },

// BMI/BMI2 foldable instructions
{ X86::ANDN32rr, X86::ANDN32rm, 0 },
{ X86::ANDN64rr, X86::ANDN64rm, 0 },
{ X86::MULX32rr, X86::MULX32rm, 0 },
{ X86::MULX64rr, X86::MULX64rm, 0 },
{ X86::PDEP32rr, X86::PDEP32rm, 0 },
{ X86::PDEP64rr, X86::PDEP64rm, 0 },
{ X86::PEXT32rr, X86::PEXT32rm, 0 },
{ X86::PEXT64rr, X86::PEXT64rm, 0 },

// AVX-512 foldable instructions
{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
{ X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
{ X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
{ X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
{ X86::VMINPSZrr, X86::VMINPSZrm, 0 },
{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
{ X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
{ X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
{ X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },

// AVX-512{F,VL} foldable instructions
{ X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },

// AVX-512{F,VL} foldable instructions
{ X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },

// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
{ X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
{ X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
{ X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
{ X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
{ X86::VAESDECrr, X86::VAESDECrm, 0 },
{ X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
{ X86::VAESENCrr, X86::VAESENCrm, 0 },

// SHA foldable instructions
{ X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
{ X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
{ X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
{ X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
{ X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
{ X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
};
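
// Register every { RegOp, MemOp, Flags } entry above in the forward (reg->mem)
// and reverse (mem->reg) folding maps; entries carrying TB_NO_FORWARD or
// TB_NO_REVERSE suppress the corresponding direction.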
for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) {
  unsigned RegOp = MemoryFoldTable2[i].RegOp;
  unsigned MemOp = MemoryFoldTable2[i].MemOp;
  unsigned Flags = MemoryFoldTable2[i].Flags;
  AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
                RegOp, MemOp,
                // Index 2, folded load
                Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}

static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
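// Entries in this table fold a load into the third operand of the register
// form (cf. the TB_INDEX_2 registration loop above for the two-operand table).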
// FMA foldable instructions
{ X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
{ X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
{ X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
{ X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
{ X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
{ X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },

{ X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
{ X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE },
{ X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE },
{ X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE },
{ X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE },
{ X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE },
{ X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE },
{ X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE },
{ X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE },
{ X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE },
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },

{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
{ X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
{ X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
{ X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },

{ X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE },
{ X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE },
{ X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE },
{ X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE },
{ X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE },
{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE },
{ X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE },
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },

{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
{ X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
{ X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
{ X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },

{ X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE },
{ X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE },
{ X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE },
{ X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE },
{ X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE },
{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE },
{ X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE },
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },

{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },

{ X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE },

{ X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE },
{ X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE },
{ X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE },
{ X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE },
{ X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE },

{ X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE },
{ X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE },
|
|
{ X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE },
|
|
|
|
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
|
2012-09-01 07:10:34 +08:00
|
|
|
|
|
|
|
// FMA4 foldable patterns
|
2012-11-04 12:40:08 +08:00
|
|
|
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
|
|
|
|
{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
|
2012-09-01 07:10:34 +08:00
|
|
|
{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
|
2012-11-04 12:40:08 +08:00
|
|
|
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
|
|
|
|
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
|
2012-09-01 07:10:34 +08:00
|
|
|
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
|
2012-11-04 12:40:08 +08:00
|
|
|
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
|
|
|
|
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
|
2012-09-01 07:10:34 +08:00
|
|
|
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
|
2012-11-04 12:40:08 +08:00
|
|
|
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
|
|
|
|
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
|
2012-09-01 07:10:34 +08:00
|
|
|
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
|
|
|
|
{ X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
|
|
|
|
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
|
2015-02-10 20:57:17 +08:00
|
|
|
|
|
|
|
// XOP foldable instructions
|
|
|
|
{ X86::VPCMOVrr, X86::VPCMOVrm, 0 },
|
|
|
|
{ X86::VPCMOVrrY, X86::VPCMOVrmY, 0 },
|
|
|
|
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
|
|
|
|
{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
|
|
|
|
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
|
|
|
|
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
|
|
|
|
{ X86::VPPERMrr, X86::VPPERMrm, 0 },
|
|
|
|
|
2013-10-06 21:11:09 +08:00
|
|
|
// AVX-512 VPERMI instructions with 3 source operands.
|
|
|
|
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
|
|
|
|
{ X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
|
|
|
|
{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
|
|
|
|
{ X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
|
2014-01-08 18:54:22 +08:00
|
|
|
{ X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
|
|
|
|
{ X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
|
|
|
|
{ X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
|
2014-12-10 02:45:30 +08:00
|
|
|
{ X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
|
|
|
|
{ X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
|
|
|
|
{ X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
|
|
|
|
{ X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
|
|
|
|
{ X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
|
2014-12-18 20:28:22 +08:00
|
|
|
{ X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
|
|
|
|
// AVX-512 arithmetic instructions
|
|
|
|
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
|
|
|
|
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
|
|
|
|
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
|
|
|
|
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
|
|
|
|
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
|
|
|
|
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
|
|
|
|
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
|
|
|
|
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
|
|
|
|
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
|
|
|
|
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
|
|
|
|
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
|
|
|
|
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
|
|
|
|
// AVX-512{F,VL} arithmetic instructions 256-bit
|
|
|
|
{ X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
|
|
|
|
{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
|
|
|
|
{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
|
|
|
|
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
|
|
|
|
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
|
|
|
|
{ X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
|
|
|
|
{ X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
|
|
|
|
{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
|
|
|
|
{ X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
|
|
|
|
{ X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
|
|
|
|
{ X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
|
|
|
|
{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
|
|
|
|
// AVX-512{F,VL} arithmetic instructions 128-bit
|
|
|
|
{ X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
|
|
|
|
{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
|
|
|
|
{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
|
|
|
|
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
|
|
|
|
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
|
|
|
|
{ X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
|
|
|
|
{ X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
|
|
|
|
{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
|
|
|
|
{ X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
|
|
|
|
{ X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
|
|
|
|
{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
|
|
|
|
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
|
2012-05-31 17:20:20 +08:00
|
|
|
};
|
|
|
|
|
2015-02-18 06:38:06 +08:00
|
|
|
for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) {
|
|
|
|
unsigned RegOp = MemoryFoldTable3[i].RegOp;
|
|
|
|
unsigned MemOp = MemoryFoldTable3[i].MemOp;
|
|
|
|
unsigned Flags = MemoryFoldTable3[i].Flags;
|
2012-05-31 17:20:20 +08:00
|
|
|
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
|
|
|
|
RegOp, MemOp,
|
|
|
|
// Index 3, folded load
|
|
|
|
Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
|
|
|
|
}
|
|
|
|
|
2015-02-18 06:38:06 +08:00
|
|
|
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
|
2014-12-18 20:28:22 +08:00
|
|
|
// AVX-512 foldable instructions
|
|
|
|
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
|
|
|
|
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
|
|
|
|
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
|
|
|
|
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
|
|
|
|
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
|
|
|
|
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
|
|
|
|
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
|
|
|
|
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
|
|
|
|
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
|
|
|
|
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
|
|
|
|
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
|
|
|
|
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
|
|
|
|
// AVX-512{F,VL} foldable instructions 256-bit
|
|
|
|
{ X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
|
|
|
|
{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
|
|
|
|
{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
|
|
|
|
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
|
|
|
|
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
|
|
|
|
{ X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
|
|
|
|
{ X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
|
|
|
|
{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
|
|
|
|
{ X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
|
|
|
|
{ X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
|
|
|
|
{ X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
|
|
|
|
{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
|
|
|
|
// AVX-512{F,VL} foldable instructions 128-bit
|
|
|
|
{ X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
|
|
|
|
{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
|
|
|
|
{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
|
|
|
|
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
|
|
|
|
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
|
|
|
|
{ X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
|
|
|
|
{ X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
|
|
|
|
{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
|
|
|
|
{ X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
|
|
|
|
{ X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
|
|
|
|
{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
|
|
|
|
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
|
|
|
|
};
|
|
|
|
|
2015-02-18 06:38:06 +08:00
|
|
|
for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) {
|
|
|
|
unsigned RegOp = MemoryFoldTable4[i].RegOp;
|
|
|
|
unsigned MemOp = MemoryFoldTable4[i].MemOp;
|
|
|
|
unsigned Flags = MemoryFoldTable4[i].Flags;
|
2014-12-18 20:28:22 +08:00
|
|
|
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
|
|
|
|
RegOp, MemOp,
|
|
|
|
// Index 4, folded load
|
|
|
|
Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
|
|
|
|
}
|
2011-09-09 02:35:57 +08:00
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2011-09-09 02:35:57 +08:00
|
|
|
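// Record a (register form, memory form) pair in the folding maps. The
// Reg->Mem (folding) direction is skipped when TB_NO_FORWARD is set, and the
// Mem->Reg (unfolding) direction is skipped when TB_NO_REVERSE is set.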
void
|
|
|
|
X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
|
|
|
|
MemOp2RegOpTableType &M2RTable,
|
|
|
|
unsigned RegOp, unsigned MemOp, unsigned Flags) {
|
|
|
|
if ((Flags & TB_NO_FORWARD) == 0) {
|
|
|
|
assert(!R2MTable.count(RegOp) && "Duplicate entry!");
|
|
|
|
R2MTable[RegOp] = std::make_pair(MemOp, Flags);
|
|
|
|
}
|
|
|
|
if ((Flags & TB_NO_REVERSE) == 0) {
|
|
|
|
assert(!M2RTable.count(MemOp) &&
|
2010-10-08 11:54:52 +08:00
|
|
|
"Duplicated entries in unfolding maps?");
|
2011-09-09 02:35:57 +08:00
|
|
|
M2RTable[MemOp] = std::make_pair(RegOp, Flags);
|
|
|
|
}
|
2002-10-26 06:55:53 +08:00
|
|
|
}
|
|
|
|
|
2010-01-12 08:09:37 +08:00
|
|
|
bool
|
2010-01-13 08:30:23 +08:00
|
|
|
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
|
|
|
|
unsigned &SrcReg, unsigned &DstReg,
|
|
|
|
unsigned &SubIdx) const {
|
2010-01-12 08:09:37 +08:00
|
|
|
switch (MI.getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case X86::MOVSX16rr8:
|
|
|
|
case X86::MOVZX16rr8:
|
|
|
|
case X86::MOVSX32rr8:
|
|
|
|
case X86::MOVZX32rr8:
|
|
|
|
case X86::MOVSX64rr8:
|
2014-06-11 06:34:31 +08:00
|
|
|
if (!Subtarget.is64Bit())
|
2010-01-13 16:01:32 +08:00
|
|
|
// It's not always legal to reference the low 8-bit of the larger
|
|
|
|
// register in 32-bit mode.
|
|
|
|
return false;
|
2010-01-12 08:09:37 +08:00
|
|
|
case X86::MOVSX32rr16:
|
|
|
|
case X86::MOVZX32rr16:
|
|
|
|
case X86::MOVSX64rr16:
|
2013-05-30 18:43:18 +08:00
|
|
|
case X86::MOVSX64rr32: {
|
2010-01-12 08:09:37 +08:00
|
|
|
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
|
|
|
|
// Be conservative.
|
|
|
|
return false;
|
|
|
|
SrcReg = MI.getOperand(1).getReg();
|
|
|
|
DstReg = MI.getOperand(0).getReg();
|
|
|
|
switch (MI.getOpcode()) {
|
2012-08-21 16:16:16 +08:00
|
|
|
default: llvm_unreachable("Unreachable!");
|
2010-01-12 08:09:37 +08:00
|
|
|
case X86::MOVSX16rr8:
|
|
|
|
case X86::MOVZX16rr8:
|
|
|
|
case X86::MOVSX32rr8:
|
|
|
|
case X86::MOVZX32rr8:
|
|
|
|
case X86::MOVSX64rr8:
|
2010-05-26 01:04:16 +08:00
|
|
|
SubIdx = X86::sub_8bit;
|
2010-01-12 08:09:37 +08:00
|
|
|
break;
|
|
|
|
case X86::MOVSX32rr16:
|
|
|
|
case X86::MOVZX32rr16:
|
|
|
|
case X86::MOVSX64rr16:
|
2010-05-26 01:04:16 +08:00
|
|
|
SubIdx = X86::sub_16bit;
|
2010-01-12 08:09:37 +08:00
|
|
|
break;
|
|
|
|
case X86::MOVSX64rr32:
|
2010-05-26 01:04:16 +08:00
|
|
|
SubIdx = X86::sub_32bit;
|
2010-01-12 08:09:37 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-01-13 08:30:23 +08:00
|
|
|
return true;
|
2010-01-12 08:09:37 +08:00
|
|
|
}
|
|
|
|
}
|
2010-01-13 08:30:23 +08:00
|
|
|
return false;
|
2010-01-12 08:09:37 +08:00
|
|
|
}
|
|
|
|
|
2015-02-02 00:56:04 +08:00
|
|
|
int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
|
|
|
|
const MachineFunction *MF = MI->getParent()->getParent();
|
|
|
|
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
|
|
|
|
|
|
|
|
if (MI->getOpcode() == getCallFrameSetupOpcode() ||
|
|
|
|
MI->getOpcode() == getCallFrameDestroyOpcode()) {
|
|
|
|
unsigned StackAlign = TFI->getStackAlignment();
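// Round the call-frame-setup amount up to the stack alignment; e.g. with a
// 16-byte stack alignment a 20-byte adjustment is treated as 32 bytes.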
|
2015-02-10 20:57:17 +08:00
|
|
|
int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
|
2015-02-02 00:56:04 +08:00
|
|
|
StackAlign;
|
|
|
|
|
|
|
|
SPAdj -= MI->getOperand(1).getImm();
|
|
|
|
|
|
|
|
if (MI->getOpcode() == getCallFrameSetupOpcode())
|
|
|
|
return SPAdj;
|
|
|
|
else
|
|
|
|
return -SPAdj;
|
|
|
|
}
|
2015-02-10 20:57:17 +08:00
|
|
|
|
|
|
|
// To know whether a call adjusts the stack, we need information
|
2015-02-02 00:56:04 +08:00
|
|
|
// that is bound to the following ADJCALLSTACKUP pseudo.
|
|
|
|
// Look for the next ADJCALLSTACKUP that follows the call.
|
|
|
|
if (MI->isCall()) {
|
|
|
|
const MachineBasicBlock* MBB = MI->getParent();
|
|
|
|
auto I = ++MachineBasicBlock::const_iterator(MI);
|
|
|
|
for (auto E = MBB->end(); I != E; ++I) {
|
|
|
|
if (I->getOpcode() == getCallFrameDestroyOpcode() ||
|
|
|
|
I->isCall())
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we could not find a frame destroy opcode, then it has already
|
|
|
|
// been simplified, so we don't care.
|
|
|
|
if (I->getOpcode() != getCallFrameDestroyOpcode())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return -(I->getOperand(1).getImm());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Currently handle only PUSHes we can reasonably expect to see
|
|
|
|
// in call sequences.
|
|
|
|
switch (MI->getOpcode()) {
|
2015-02-10 20:57:17 +08:00
|
|
|
default:
|
2015-02-02 00:56:04 +08:00
|
|
|
return 0;
|
|
|
|
case X86::PUSH32i8:
|
|
|
|
case X86::PUSH32r:
|
|
|
|
case X86::PUSH32rmm:
|
|
|
|
case X86::PUSH32rmr:
|
|
|
|
case X86::PUSHi32:
|
|
|
|
return 4;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Return true and the FrameIndex if the specified
|
2009-11-13 04:55:29 +08:00
|
|
|
/// operand and the following operands form a reference to the stack frame.
|
|
|
|
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
|
|
|
|
int &FrameIndex) const {
|
2014-05-06 15:04:32 +08:00
|
|
|
if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
|
|
|
|
MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
|
|
|
|
MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
|
|
|
|
MI->getOperand(Op+X86::AddrDisp).isImm() &&
|
|
|
|
MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
|
|
|
|
MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
|
|
|
|
MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
|
|
|
|
FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
|
2009-11-13 04:55:29 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-11-13 08:29:53 +08:00
|
|
|
static bool isFrameLoadOpcode(int Opcode) {
|
|
|
|
switch (Opcode) {
|
2012-01-21 05:51:11 +08:00
|
|
|
default:
|
|
|
|
return false;
|
2006-02-03 04:12:32 +08:00
|
|
|
case X86::MOV8rm:
|
|
|
|
case X86::MOV16rm:
|
|
|
|
case X86::MOV32rm:
|
2006-09-08 14:48:29 +08:00
|
|
|
case X86::MOV64rm:
|
2007-07-05 05:07:47 +08:00
|
|
|
case X86::LD_Fp64m:
|
2006-02-03 04:12:32 +08:00
|
|
|
case X86::MOVSSrm:
|
|
|
|
case X86::MOVSDrm:
|
2006-04-19 00:44:51 +08:00
|
|
|
case X86::MOVAPSrm:
|
|
|
|
case X86::MOVAPDrm:
|
2009-01-09 10:40:34 +08:00
|
|
|
case X86::MOVDQArm:
|
2011-09-14 10:36:58 +08:00
|
|
|
case X86::VMOVSSrm:
|
|
|
|
case X86::VMOVSDrm:
|
|
|
|
case X86::VMOVAPSrm:
|
|
|
|
case X86::VMOVAPDrm:
|
|
|
|
case X86::VMOVDQArm:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVUPSYrm:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVAPSYrm:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVUPDYrm:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVAPDYrm:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVDQUYrm:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVDQAYrm:
|
2007-04-03 14:00:37 +08:00
|
|
|
case X86::MMX_MOVD64rm:
|
|
|
|
case X86::MMX_MOVQ64rm:
|
2014-01-23 22:27:26 +08:00
|
|
|
case X86::VMOVAPSZrm:
|
|
|
|
case X86::VMOVUPSZrm:
|
2009-11-13 08:29:53 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool isFrameStoreOpcode(int Opcode) {
|
|
|
|
switch (Opcode) {
|
|
|
|
default: break;
|
|
|
|
case X86::MOV8mr:
|
|
|
|
case X86::MOV16mr:
|
|
|
|
case X86::MOV32mr:
|
|
|
|
case X86::MOV64mr:
|
|
|
|
case X86::ST_FpP64m:
|
|
|
|
case X86::MOVSSmr:
|
|
|
|
case X86::MOVSDmr:
|
|
|
|
case X86::MOVAPSmr:
|
|
|
|
case X86::MOVAPDmr:
|
|
|
|
case X86::MOVDQAmr:
|
2011-09-14 10:36:58 +08:00
|
|
|
case X86::VMOVSSmr:
|
|
|
|
case X86::VMOVSDmr:
|
|
|
|
case X86::VMOVAPSmr:
|
|
|
|
case X86::VMOVAPDmr:
|
|
|
|
case X86::VMOVDQAmr:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVUPSYmr:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVAPSYmr:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVUPDYmr:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVAPDYmr:
|
2014-11-19 07:38:19 +08:00
|
|
|
case X86::VMOVDQUYmr:
|
2011-07-15 02:50:58 +08:00
|
|
|
case X86::VMOVDQAYmr:
|
2014-01-23 22:27:26 +08:00
|
|
|
case X86::VMOVUPSZmr:
|
|
|
|
case X86::VMOVAPSZmr:
|
2009-11-13 08:29:53 +08:00
|
|
|
case X86::MMX_MOVD64mr:
|
|
|
|
case X86::MMX_MOVQ64mr:
|
|
|
|
case X86::MMX_MOVNTQmr:
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-01-26 10:03:37 +08:00
|
|
|
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
|
2009-11-13 08:29:53 +08:00
|
|
|
int &FrameIndex) const {
|
|
|
|
if (isFrameLoadOpcode(MI->getOpcode()))
|
2010-07-27 12:17:01 +08:00
|
|
|
if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
|
2006-02-03 04:12:32 +08:00
|
|
|
return MI->getOperand(0).getReg();
|
2009-11-13 08:29:53 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-01-26 10:03:37 +08:00
|
|
|
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
|
2009-11-13 08:29:53 +08:00
|
|
|
int &FrameIndex) const {
|
|
|
|
if (isFrameLoadOpcode(MI->getOpcode())) {
|
|
|
|
unsigned Reg;
|
|
|
|
if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
|
|
|
|
return Reg;
|
2009-11-13 04:55:29 +08:00
|
|
|
// Check for post-frame index elimination operations
|
2009-12-05 06:38:46 +08:00
|
|
|
const MachineMemOperand *Dummy;
|
|
|
|
return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
|
2006-02-03 04:12:32 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-11-19 03:49:32 +08:00
|
|
|
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
|
2006-02-03 04:12:32 +08:00
|
|
|
int &FrameIndex) const {
|
2009-11-13 08:29:53 +08:00
|
|
|
if (isFrameStoreOpcode(MI->getOpcode()))
|
2010-07-27 12:17:01 +08:00
|
|
|
if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
|
|
|
|
isFrameOperand(MI, 0, FrameIndex))
|
2010-07-09 06:41:28 +08:00
|
|
|
return MI->getOperand(X86::AddrNumOperands).getReg();
|
2009-11-13 08:29:53 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
|
|
|
|
int &FrameIndex) const {
|
|
|
|
if (isFrameStoreOpcode(MI->getOpcode())) {
|
|
|
|
unsigned Reg;
|
|
|
|
if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
|
|
|
|
return Reg;
|
2009-11-13 04:55:29 +08:00
|
|
|
// Check for post-frame index elimination operations
|
2009-12-05 06:38:46 +08:00
|
|
|
const MachineMemOperand *Dummy;
|
|
|
|
return hasStoreToStackSlot(MI, Dummy, FrameIndex);
|
2006-02-03 04:12:32 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Return true if the register is a PIC base, i.e. it is defined by X86::MOVPC32r.
|
2008-07-08 07:14:23 +08:00
|
|
|
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
|
2012-08-08 08:40:47 +08:00
|
|
|
// Don't waste compile time scanning use-def chains of physregs.
|
|
|
|
if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
|
|
|
|
return false;
|
2008-03-27 09:45:11 +08:00
|
|
|
bool isPICBase = false;
|
2014-03-14 07:12:04 +08:00
|
|
|
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
|
|
|
|
E = MRI.def_instr_end(); I != E; ++I) {
|
|
|
|
MachineInstr *DefMI = &*I;
|
2008-03-27 09:45:11 +08:00
|
|
|
if (DefMI->getOpcode() != X86::MOVPC32r)
|
|
|
|
return false;
|
|
|
|
assert(!isPICBase && "More than one PIC base?");
|
|
|
|
isPICBase = true;
|
|
|
|
}
|
|
|
|
return isPICBase;
|
|
|
|
}
|
2008-03-31 15:54:19 +08:00
|
|
|
|
2008-05-13 04:54:26 +08:00
|
|
|
bool
|
2009-10-10 08:34:18 +08:00
|
|
|
X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
|
|
|
|
AliasAnalysis *AA) const {
|
2007-06-15 04:50:44 +08:00
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: break;
|
2012-08-21 16:17:07 +08:00
|
|
|
case X86::MOV8rm:
|
|
|
|
case X86::MOV16rm:
|
|
|
|
case X86::MOV32rm:
|
|
|
|
case X86::MOV64rm:
|
|
|
|
case X86::LD_Fp64m:
|
|
|
|
case X86::MOVSSrm:
|
|
|
|
case X86::MOVSDrm:
|
|
|
|
case X86::MOVAPSrm:
|
|
|
|
case X86::MOVUPSrm:
|
|
|
|
case X86::MOVAPDrm:
|
|
|
|
case X86::MOVDQArm:
|
2012-12-06 14:49:16 +08:00
|
|
|
case X86::MOVDQUrm:
|
2012-08-21 16:17:07 +08:00
|
|
|
case X86::VMOVSSrm:
|
|
|
|
case X86::VMOVSDrm:
|
|
|
|
case X86::VMOVAPSrm:
|
|
|
|
case X86::VMOVUPSrm:
|
|
|
|
case X86::VMOVAPDrm:
|
|
|
|
case X86::VMOVDQArm:
|
2012-12-06 14:49:16 +08:00
|
|
|
case X86::VMOVDQUrm:
|
2012-08-21 16:17:07 +08:00
|
|
|
case X86::VMOVAPSYrm:
|
|
|
|
case X86::VMOVUPSYrm:
|
|
|
|
case X86::VMOVAPDYrm:
|
|
|
|
case X86::VMOVDQAYrm:
|
2012-12-06 14:49:16 +08:00
|
|
|
case X86::VMOVDQUYrm:
|
2012-08-21 16:17:07 +08:00
|
|
|
case X86::MMX_MOVD64rm:
|
|
|
|
case X86::MMX_MOVQ64rm:
|
|
|
|
case X86::FsVMOVAPSrm:
|
|
|
|
case X86::FsVMOVAPDrm:
|
|
|
|
case X86::FsMOVAPSrm:
|
|
|
|
case X86::FsMOVAPDrm: {
|
|
|
|
// Loads from constant pools are trivially rematerializable.
|
2014-05-06 15:04:32 +08:00
|
|
|
if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
|
|
|
|
MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
|
|
|
|
MI->getOperand(1+X86::AddrIndexReg).isReg() &&
|
|
|
|
MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
|
2012-08-21 16:17:07 +08:00
|
|
|
MI->isInvariantLoad(AA)) {
|
2014-05-06 15:04:32 +08:00
|
|
|
unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
|
2012-08-21 16:17:07 +08:00
|
|
|
if (BaseReg == 0 || BaseReg == X86::RIP)
|
|
|
|
return true;
|
|
|
|
// Allow re-materialization of PIC load.
|
2014-05-06 15:04:32 +08:00
|
|
|
if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
|
2012-08-21 16:17:07 +08:00
|
|
|
return false;
|
|
|
|
const MachineFunction &MF = *MI->getParent()->getParent();
|
|
|
|
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
return regIsPICBase(BaseReg, MRI);
|
2008-02-22 17:25:47 +08:00
|
|
|
}
|
2012-08-21 16:17:07 +08:00
|
|
|
return false;
|
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2012-08-21 16:17:07 +08:00
|
|
|
case X86::LEA32r:
|
|
|
|
case X86::LEA64r: {
|
2014-05-06 15:04:32 +08:00
|
|
|
if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
|
|
|
|
MI->getOperand(1+X86::AddrIndexReg).isReg() &&
|
|
|
|
MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
|
|
|
|
!MI->getOperand(1+X86::AddrDisp).isReg()) {
|
2012-08-21 16:17:07 +08:00
|
|
|
// lea fi#, lea GV, etc. are all rematerializable.
|
2014-05-06 15:04:32 +08:00
|
|
|
if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
|
2012-08-21 16:17:07 +08:00
|
|
|
return true;
|
2014-05-06 15:04:32 +08:00
|
|
|
unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
|
2012-08-21 16:17:07 +08:00
|
|
|
if (BaseReg == 0)
|
|
|
|
return true;
|
|
|
|
// Allow re-materialization of lea PICBase + x.
|
|
|
|
const MachineFunction &MF = *MI->getParent()->getParent();
|
|
|
|
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
return regIsPICBase(BaseReg, MRI);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2007-06-15 04:50:44 +08:00
|
|
|
}
|
2008-03-27 09:41:09 +08:00
|
|
|
|
2007-06-26 08:48:07 +08:00
|
|
|
// All other instructions marked M_REMATERIALIZABLE are always trivially
|
|
|
|
// rematerializable.
|
|
|
|
return true;
|
2007-06-15 04:50:44 +08:00
|
|
|
}
|
|
|
|
|
2014-05-20 16:55:50 +08:00
|
|
|
bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
|
|
|
|
MachineBasicBlock::iterator I) const {
|
2010-03-24 04:35:45 +08:00
|
|
|
MachineBasicBlock::iterator E = MBB.end();
|
|
|
|
|
2008-06-24 15:10:51 +08:00
|
|
|
// For compile time consideration, if we are not able to determine the
|
2009-10-14 08:08:59 +08:00
|
|
|
// safety after visiting 4 instructions in each direction, we will assume
|
|
|
|
// it's not safe.
|
|
|
|
MachineBasicBlock::iterator Iter = I;
|
2011-09-03 07:52:52 +08:00
|
|
|
for (unsigned i = 0; Iter != E && i < 4; ++i) {
|
2008-06-24 15:10:51 +08:00
|
|
|
bool SeenDef = false;
|
2009-10-14 08:08:59 +08:00
|
|
|
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
|
|
|
|
MachineOperand &MO = Iter->getOperand(j);
|
2012-02-09 08:17:22 +08:00
|
|
|
if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
|
|
|
|
SeenDef = true;
|
2008-10-03 23:45:36 +08:00
|
|
|
if (!MO.isReg())
|
2008-06-24 15:10:51 +08:00
|
|
|
continue;
|
|
|
|
if (MO.getReg() == X86::EFLAGS) {
|
|
|
|
if (MO.isUse())
|
|
|
|
return false;
|
|
|
|
SeenDef = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (SeenDef)
|
|
|
|
// This instruction defines EFLAGS, no need to look any further.
|
|
|
|
return true;
|
2009-10-14 08:08:59 +08:00
|
|
|
++Iter;
|
2010-03-24 04:35:45 +08:00
|
|
|
// Skip over DBG_VALUE.
|
|
|
|
while (Iter != E && Iter->isDebugValue())
|
|
|
|
++Iter;
|
2011-09-03 07:52:52 +08:00
|
|
|
}
|
2008-10-21 11:24:31 +08:00
|
|
|
|
2011-09-03 07:52:52 +08:00
|
|
|
// It is safe to clobber EFLAGS at the end of a block if no successor has it
|
|
|
|
// live in.
|
|
|
|
if (Iter == E) {
|
|
|
|
for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
|
|
|
|
SE = MBB.succ_end(); SI != SE; ++SI)
|
|
|
|
if ((*SI)->isLiveIn(X86::EFLAGS))
|
|
|
|
return false;
|
|
|
|
return true;
|
2009-10-14 08:08:59 +08:00
|
|
|
}
|
|
|
|
|
2010-03-24 04:35:45 +08:00
|
|
|
MachineBasicBlock::iterator B = MBB.begin();
|
2009-10-14 08:08:59 +08:00
|
|
|
Iter = I;
|
|
|
|
for (unsigned i = 0; i < 4; ++i) {
|
|
|
|
// If we make it to the beginning of the block, it's safe to clobber
|
2012-09-27 18:14:43 +08:00
|
|
|
// EFLAGS iff EFLAGS is not live-in.
|
2010-03-24 04:35:45 +08:00
|
|
|
if (Iter == B)
|
2009-10-14 08:08:59 +08:00
|
|
|
return !MBB.isLiveIn(X86::EFLAGS);
|
|
|
|
|
|
|
|
--Iter;
|
2010-03-24 04:35:45 +08:00
|
|
|
// Skip over DBG_VALUE.
|
|
|
|
while (Iter != B && Iter->isDebugValue())
|
|
|
|
--Iter;
|
|
|
|
|
2009-10-14 08:08:59 +08:00
|
|
|
bool SawKill = false;
|
|
|
|
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
|
|
|
|
MachineOperand &MO = Iter->getOperand(j);
|
2012-02-09 08:17:22 +08:00
|
|
|
// A register mask may clobber EFLAGS, but we should still look for a
|
|
|
|
// live EFLAGS def.
|
|
|
|
if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
|
|
|
|
SawKill = true;
|
2009-10-14 08:08:59 +08:00
|
|
|
if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
|
|
|
|
if (MO.isDef()) return MO.isDead();
|
|
|
|
if (MO.isKill()) SawKill = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (SawKill)
|
|
|
|
// This instruction kills EFLAGS and doesn't redefine it, so
|
|
|
|
// there's no need to look further.
|
2008-10-21 11:24:31 +08:00
|
|
|
return true;
|
2008-06-24 15:10:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Conservative answer.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2008-04-01 04:40:39 +08:00
|
|
|
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
|
|
|
|
MachineBasicBlock::iterator I,
|
2009-07-16 17:20:10 +08:00
|
|
|
unsigned DestReg, unsigned SubIdx,
|
2009-11-14 10:55:43 +08:00
|
|
|
const MachineInstr *Orig,
|
2010-06-03 06:47:25 +08:00
|
|
|
const TargetRegisterInfo &TRI) const {
|
2013-05-30 21:19:42 +08:00
|
|
|
// MOV32r0 is implemented with a xor which clobbers condition code.
|
|
|
|
// Re-materialize it as movri instructions to avoid side effects.
|
2009-07-16 17:20:10 +08:00
|
|
|
unsigned Opc = Orig->getOpcode();
|
2013-05-30 21:19:42 +08:00
|
|
|
if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
|
|
|
|
DebugLoc DL = Orig->getDebugLoc();
|
|
|
|
BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
|
|
|
|
.addImm(0);
|
|
|
|
} else {
|
2008-07-08 07:14:23 +08:00
|
|
|
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
|
2008-04-01 04:40:39 +08:00
|
|
|
MBB.insert(I, MI);
|
|
|
|
}
|
2008-04-17 07:44:44 +08:00
|
|
|
|
2014-03-02 20:27:27 +08:00
|
|
|
MachineInstr *NewMI = std::prev(I);
|
2010-06-03 06:47:25 +08:00
|
|
|
NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
|
2008-04-01 04:40:39 +08:00
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
|
2007-10-05 16:04:01 +08:00
|
|
|
static bool hasLiveCondCodeDef(MachineInstr *MI) {
|
|
|
|
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i);
|
2008-10-03 23:45:36 +08:00
|
|
|
if (MO.isReg() && MO.isDef() &&
|
2007-10-05 16:04:01 +08:00
|
|
|
MO.getReg() == X86::EFLAGS && !MO.isDead()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Return the shift count of a machine operand, truncated to the number of
/// bits the hardware actually uses.
|
2013-05-22 16:13:02 +08:00
|
|
|
inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
|
|
|
|
unsigned ShiftAmtOperandIdx) {
|
|
|
|
// The shift count is six bits with the REX.W prefix and five bits without.
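// (The hardware masks the count, so e.g. an encoded count of 65 on a 64-bit
// operand actually shifts by 65 & 63 == 1.)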
|
|
|
|
unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
|
|
|
|
unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
|
|
|
|
return Imm & ShiftCountMask;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Check whether the given shift count is appropriate, i.e. whether it
|
2013-05-22 16:13:02 +08:00
|
|
|
/// can be represented by a LEA instruction.
|
|
|
|
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
|
|
|
|
// Left shift instructions can be transformed into load-effective-address
|
|
|
|
// instructions if we can encode them appropriately.
|
|
|
|
// A LEA instruction utilizes a SIB byte to encode its scale factor.
|
|
|
|
// The SIB.scale field is two bits wide which means that we can encode any
|
|
|
|
// shift amount less than 4.
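// For example, a shift by 3 maps to an LEA with scale 8 (1 << 3), while a
// shift by 4 would need scale 16, which the SIB byte cannot encode.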
|
|
|
|
return ShAmt < 4 && ShAmt > 0;
|
|
|
|
}
|
|
|
|
|
2013-06-11 04:43:49 +08:00
|
|
|
bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
|
|
|
|
unsigned Opc, bool AllowSP,
|
|
|
|
unsigned &NewSrc, bool &isKill, bool &isUndef,
|
|
|
|
MachineOperand &ImplicitOp) const {
|
|
|
|
MachineFunction &MF = *MI->getParent()->getParent();
|
|
|
|
const TargetRegisterClass *RC;
|
|
|
|
if (AllowSP) {
|
|
|
|
RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
|
|
|
|
} else {
|
|
|
|
RC = Opc != X86::LEA32r ?
|
|
|
|
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
|
|
|
|
}
|
|
|
|
unsigned SrcReg = Src.getReg();
|
|
|
|
|
|
|
|
// For both LEA64 and LEA32 the register already has essentially the right
|
|
|
|
// type (32-bit or 64-bit); we may just need to forbid SP.
|
|
|
|
if (Opc != X86::LEA64_32r) {
|
|
|
|
NewSrc = SrcReg;
|
|
|
|
isKill = Src.isKill();
|
|
|
|
isUndef = Src.isUndef();
|
|
|
|
|
|
|
|
if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
|
|
|
|
!MF.getRegInfo().constrainRegClass(NewSrc, RC))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// This is for an LEA64_32r and the incoming registers are 32-bit. One way or
|
|
|
|
// another we need to add 64-bit registers to the final MI.
|
|
|
|
if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
|
|
|
|
ImplicitOp = Src;
|
|
|
|
ImplicitOp.setImplicit();
|
|
|
|
|
|
|
|
NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
|
|
|
|
MachineBasicBlock::LivenessQueryResult LQR =
|
|
|
|
MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
|
|
|
|
|
|
|
|
switch (LQR) {
|
|
|
|
case MachineBasicBlock::LQR_Unknown:
|
|
|
|
// We can't give sane liveness flags to the instruction, abandon LEA
|
|
|
|
// formation.
|
|
|
|
return false;
|
|
|
|
case MachineBasicBlock::LQR_Live:
|
|
|
|
isKill = MI->killsRegister(SrcReg);
|
|
|
|
isUndef = false;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// The physreg itself is dead, so we have to use it as an <undef>.
|
|
|
|
isKill = false;
|
|
|
|
isUndef = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Virtual register of the wrong class; we have to create a temporary 64-bit
|
|
|
|
// vreg to feed into the LEA.
|
|
|
|
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
|
|
|
|
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
|
|
|
|
get(TargetOpcode::COPY))
|
|
|
|
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
|
|
|
|
.addOperand(Src);
|
|
|
|
|
|
|
|
// Which is obviously going to be dead after we're done with it.
|
|
|
|
isKill = true;
|
|
|
|
isUndef = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We've set all the parameters without issue.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Helper for convertToThreeAddress when 16-bit LEA is disabled: use a 32-bit
|
|
|
|
/// LEA to form 3-address code by promoting to a 32-bit superregister and then
|
|
|
|
/// truncating back down to a 16-bit subregister.
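/// For example, a 16-bit ADD is rewritten as a wider LEA: the source is
/// copied into the low 16 bits of a fresh 32-bit (or 64-bit) virtual
/// register, the LEA computes the result there, and the low 16 bits of the
/// result are copied back into the original 16-bit destination.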
|
2009-12-11 14:01:48 +08:00
|
|
|
MachineInstr *
|
|
|
|
X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
|
|
|
|
MachineFunction::iterator &MFI,
|
|
|
|
MachineBasicBlock::iterator &MBBI,
|
|
|
|
LiveVariables *LV) const {
|
|
|
|
MachineInstr *MI = MBBI;
|
|
|
|
unsigned Dest = MI->getOperand(0).getReg();
|
|
|
|
unsigned Src = MI->getOperand(1).getReg();
|
|
|
|
bool isDead = MI->getOperand(0).isDead();
|
|
|
|
bool isKill = MI->getOperand(1).isKill();
|
|
|
|
|
|
|
|
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
|
|
|
|
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
|
2013-06-11 04:43:49 +08:00
|
|
|
unsigned Opc, leaInReg;
|
2014-06-11 06:34:31 +08:00
|
|
|
if (Subtarget.is64Bit()) {
|
2013-06-11 04:43:49 +08:00
|
|
|
Opc = X86::LEA64_32r;
|
|
|
|
leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
|
|
|
|
} else {
|
|
|
|
Opc = X86::LEA32r;
|
|
|
|
leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
|
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2009-12-11 14:01:48 +08:00
|
|
|
// Build and insert into an implicit UNDEF value. This is OK because
|
2011-01-26 10:03:37 +08:00
|
|
|
// we'll be shifting and then extracting the lower 16-bits.
|
2009-12-13 04:03:14 +08:00
|
|
|
// This has the potential to cause a partial register stall, e.g.:
|
2009-12-13 02:55:26 +08:00
|
|
|
// movw (%rbp,%rcx,2), %dx
|
|
|
|
// leal -65(%rdx), %esi
|
2009-12-13 04:03:14 +08:00
|
|
|
// But testing has shown this *does* help performance in 64-bit mode (at
|
|
|
|
// least on modern x86 machines).
|
2009-12-11 14:01:48 +08:00
|
|
|
BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
|
|
|
|
MachineInstr *InsMI =
|
2010-07-09 00:40:15 +08:00
|
|
|
BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
|
|
|
|
.addReg(leaInReg, RegState::Define, X86::sub_16bit)
|
|
|
|
.addReg(Src, getKillRegState(isKill));
|
2009-12-11 14:01:48 +08:00
|
|
|
|
|
|
|
MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
|
|
|
|
get(Opc), leaOutReg);
|
|
|
|
switch (MIOpc) {
|
2012-08-21 16:16:16 +08:00
|
|
|
default: llvm_unreachable("Unreachable!");
|
2009-12-11 14:01:48 +08:00
|
|
|
case X86::SHL16ri: {
|
|
|
|
unsigned ShAmt = MI->getOperand(2).getImm();
|
|
|
|
MIB.addReg(0).addImm(1 << ShAmt)
|
2010-07-09 07:46:44 +08:00
|
|
|
.addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
|
2009-12-11 14:01:48 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case X86::INC16r:
|
2010-07-09 07:46:44 +08:00
|
|
|
addRegOffset(MIB, leaInReg, true, 1);
|
2009-12-11 14:01:48 +08:00
|
|
|
break;
|
|
|
|
case X86::DEC16r:
|
2010-07-09 07:46:44 +08:00
|
|
|
addRegOffset(MIB, leaInReg, true, -1);
|
2009-12-11 14:01:48 +08:00
|
|
|
break;
|
|
|
|
case X86::ADD16ri:
|
|
|
|
case X86::ADD16ri8:
|
2010-10-08 11:57:25 +08:00
|
|
|
case X86::ADD16ri_DB:
|
|
|
|
case X86::ADD16ri8_DB:
|
2011-01-26 10:03:37 +08:00
|
|
|
addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
|
2009-12-11 14:01:48 +08:00
|
|
|
break;
|
2010-10-08 11:54:52 +08:00
|
|
|
case X86::ADD16rr:
|
|
|
|
case X86::ADD16rr_DB: {
|
2009-12-11 14:01:48 +08:00
|
|
|
unsigned Src2 = MI->getOperand(2).getReg();
|
|
|
|
bool isKill2 = MI->getOperand(2).isKill();
|
|
|
|
unsigned leaInReg2 = 0;
|
2014-04-25 13:30:21 +08:00
|
|
|
MachineInstr *InsMI2 = nullptr;
|
2009-12-11 14:01:48 +08:00
|
|
|
if (Src == Src2) {
|
|
|
|
// ADD16rr %reg1028<kill>, %reg1028
|
|
|
|
// needs just a single insert_subreg.
|
|
|
|
addRegReg(MIB, leaInReg, true, leaInReg, false);
|
|
|
|
} else {
|
2014-06-11 06:34:31 +08:00
|
|
|
if (Subtarget.is64Bit())
|
2013-06-11 04:43:49 +08:00
|
|
|
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
|
|
|
|
else
|
|
|
|
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
|
2009-12-11 14:01:48 +08:00
|
|
|
// Build and insert into an implicit UNDEF value. This is OK because
|
2011-01-26 10:03:37 +08:00
|
|
|
// we'll be shifting and then extracting the lower 16-bits.
|
2011-12-14 10:11:42 +08:00
|
|
|
BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
|
2009-12-11 14:01:48 +08:00
|
|
|
InsMI2 =
|
2011-12-14 10:11:42 +08:00
|
|
|
BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
|
2010-07-09 00:40:15 +08:00
|
|
|
.addReg(leaInReg2, RegState::Define, X86::sub_16bit)
|
|
|
|
.addReg(Src2, getKillRegState(isKill2));
|
2009-12-11 14:01:48 +08:00
|
|
|
addRegReg(MIB, leaInReg, true, leaInReg2, true);
|
|
|
|
}
|
|
|
|
if (LV && isKill2 && InsMI2)
|
|
|
|
LV->replaceKillInstruction(Src2, MI, InsMI2);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
MachineInstr *NewMI = MIB;
|
|
|
|
MachineInstr *ExtMI =
|
2010-07-09 00:40:22 +08:00
|
|
|
BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
|
2009-12-11 14:01:48 +08:00
|
|
|
.addReg(Dest, RegState::Define | getDeadRegState(isDead))
|
2010-07-09 00:40:22 +08:00
|
|
|
.addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
|
2009-12-11 14:01:48 +08:00
|
|
|
|
|
|
|
if (LV) {
|
|
|
|
// Update live variables
|
|
|
|
LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
|
|
|
|
LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
|
|
|
|
if (isKill)
|
|
|
|
LV->replaceKillInstruction(Src, MI, InsMI);
|
|
|
|
if (isDead)
|
|
|
|
LV->replaceKillInstruction(Dest, MI, ExtMI);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ExtMI;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// This method must be implemented by targets that
|
2005-01-02 10:37:07 +08:00
|
|
|
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
|
|
|
|
/// may be able to convert a two-address instruction into a true
|
|
|
|
/// three-address instruction on demand. This allows the X86 target (for
|
|
|
|
/// example) to convert ADD and SHL instructions into LEA instructions if they
|
|
|
|
/// would require register copies due to two-addressness.
|
|
|
|
///
|
|
|
|
/// This method returns a null pointer if the transformation cannot be
|
|
|
|
/// performed, otherwise it returns the new instruction.
|
|
|
|
///
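/// For example, 'shll $2, %reg' can become 'leal (,%reg,4), %newreg', and
/// 'addl %src2, %src1' can become 'leal (%src1,%src2), %newreg', so the
/// result no longer has to be tied to the first source register.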
|
2006-12-02 05:52:41 +08:00
|
|
|
MachineInstr *
|
|
|
|
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
|
|
|
|
MachineBasicBlock::iterator &MBBI,
|
2008-07-03 07:41:07 +08:00
|
|
|
LiveVariables *LV) const {
|
2006-12-02 05:52:41 +08:00
|
|
|
MachineInstr *MI = MBBI;
|
2013-05-22 16:13:02 +08:00
|
|
|
|
|
|
|
// The following opcodes also set the condition code register(s). Only
|
|
|
|
// convert them to an equivalent LEA if the condition code register defs
|
|
|
|
// are dead!
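// (LEA does not set EFLAGS, so e.g. an ADD whose flags result feeds a later
// conditional branch cannot be turned into an LEA.)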
|
|
|
|
if (hasLiveCondCodeDef(MI))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2013-05-22 16:13:02 +08:00
|
|
|
|
2008-07-08 07:14:23 +08:00
|
|
|
MachineFunction &MF = *MI->getParent()->getParent();
|
2005-01-02 10:37:07 +08:00
|
|
|
// All instructions input are two-addr instructions. Get the known operands.
|
2012-08-24 06:36:31 +08:00
|
|
|
const MachineOperand &Dest = MI->getOperand(0);
|
|
|
|
const MachineOperand &Src = MI->getOperand(1);
|
2005-01-02 10:37:07 +08:00
|
|
|
|
2014-04-25 13:30:21 +08:00
|
|
|
MachineInstr *NewMI = nullptr;
|
2006-12-02 05:52:41 +08:00
|
|
|
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
|
2007-03-20 14:08:29 +08:00
|
|
|
// we have better subtarget support, enable the 16-bit LEA generation here.
|
2009-12-13 04:03:14 +08:00
|
|
|
// 16-bit LEA is also slow on Core2.
|
2006-12-02 05:52:41 +08:00
|
|
|
bool DisableLEA16 = true;
|
2014-06-11 06:34:31 +08:00
|
|
|
bool is64Bit = Subtarget.is64Bit();
|
2006-12-02 05:52:41 +08:00
|
|
|
|
2007-10-06 04:34:26 +08:00
|
|
|
unsigned MIOpc = MI->getOpcode();
|
|
|
|
switch (MIOpc) {
|
2015-01-07 16:10:38 +08:00
|
|
|
default: return nullptr;
|
2007-03-29 02:12:31 +08:00
|
|
|
case X86::SHL64ri: {
|
2007-09-15 05:48:26 +08:00
|
|
|
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
|
2013-05-22 16:13:02 +08:00
|
|
|
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
|
2008-07-03 17:09:37 +08:00
|
|
|
|
2010-10-07 08:07:26 +08:00
|
|
|
// LEA can't handle RSP.
|
2012-08-24 06:36:31 +08:00
|
|
|
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
|
|
|
|
!MF.getRegInfo().constrainRegClass(Src.getReg(),
|
|
|
|
&X86::GR64_NOSPRegClass))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2010-10-07 08:07:26 +08:00
|
|
|
|
2009-02-12 05:51:19 +08:00
|
|
|
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
|
2012-08-24 06:36:31 +08:00
|
|
|
.addOperand(Dest)
|
|
|
|
.addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
|
2007-03-29 02:12:31 +08:00
|
|
|
break;
|
|
|
|
}
|
2007-03-20 14:08:29 +08:00
|
|
|
case X86::SHL32ri: {
|
2007-09-15 05:48:26 +08:00
|
|
|
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
|
2013-05-22 16:13:02 +08:00
|
|
|
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
|
2008-07-03 17:09:37 +08:00
|
|
|
|
2013-06-11 04:43:49 +08:00
|
|
|
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
|
|
|
|
|
2010-10-07 08:07:26 +08:00
|
|
|
// LEA can't handle ESP.
|
2013-06-11 04:43:49 +08:00
|
|
|
bool isKill, isUndef;
|
|
|
|
unsigned SrcReg;
|
|
|
|
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
|
|
|
|
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
|
|
|
|
SrcReg, isKill, isUndef, ImplicitOp))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2010-10-07 08:07:26 +08:00
|
|
|
|
2013-06-11 04:43:49 +08:00
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
|
2012-08-24 06:36:31 +08:00
|
|
|
.addOperand(Dest)
|
2013-06-11 04:43:49 +08:00
|
|
|
.addReg(0).addImm(1 << ShAmt)
|
|
|
|
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
|
|
|
|
.addImm(0).addReg(0);
|
|
|
|
if (ImplicitOp.getReg() != 0)
|
|
|
|
MIB.addOperand(ImplicitOp);
|
|
|
|
NewMI = MIB;
|
|
|
|
|
2007-03-20 14:08:29 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case X86::SHL16ri: {
|
2007-09-15 05:48:26 +08:00
|
|
|
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
|
2013-05-22 16:13:02 +08:00
|
|
|
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
|
2008-07-03 17:09:37 +08:00
|
|
|
|
2009-12-11 14:01:48 +08:00
|
|
|
if (DisableLEA16)
|
2014-04-25 13:30:21 +08:00
|
|
|
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
|
2009-12-11 14:01:48 +08:00
|
|
|
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
|
2012-08-24 06:36:31 +08:00
|
|
|
.addOperand(Dest)
|
|
|
|
.addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
|
2007-03-20 14:08:29 +08:00
|
|
|
break;
|
2006-05-31 04:26:50 +08:00
|
|
|
}
|
2015-01-07 16:10:38 +08:00
|
|
|
case X86::INC64r:
|
|
|
|
case X86::INC32r: {
|
|
|
|
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
|
|
|
|
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
|
|
|
|
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
|
|
|
|
bool isKill, isUndef;
|
|
|
|
unsigned SrcReg;
|
|
|
|
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
|
|
|
|
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
|
|
|
|
SrcReg, isKill, isUndef, ImplicitOp))
|
|
|
|
return nullptr;
|
2013-06-11 04:43:49 +08:00
|
|
|
|
2015-01-07 16:10:38 +08:00
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
|
|
|
|
.addOperand(Dest)
|
|
|
|
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
|
|
|
|
if (ImplicitOp.getReg() != 0)
|
|
|
|
MIB.addOperand(ImplicitOp);
|
|
|
|
|
|
|
|
NewMI = addOffset(MIB, 1);
|
|
|
|
break;
|
2005-01-02 10:37:07 +08:00
|
|
|
}
|
2015-01-07 16:10:38 +08:00
|
|
|
case X86::INC16r:
|
|
|
|
if (DisableLEA16)
|
|
|
|
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
|
|
|
|
: nullptr;
|
|
|
|
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
|
|
|
|
NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
|
|
|
|
.addOperand(Dest).addOperand(Src), 1);
|
|
|
|
break;
|
|
|
|
case X86::DEC64r:
|
|
|
|
case X86::DEC32r: {
|
|
|
|
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
|
|
|
|
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
|
|
|
|
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
|
|
|
|
|
|
|
|
bool isKill, isUndef;
|
|
|
|
unsigned SrcReg;
|
|
|
|
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
|
|
|
|
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
|
|
|
|
SrcReg, isKill, isUndef, ImplicitOp))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
|
|
|
|
.addOperand(Dest)
|
|
|
|
.addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
|
|
|
|
if (ImplicitOp.getReg() != 0)
|
|
|
|
MIB.addOperand(ImplicitOp);
|
|
|
|
|
|
|
|
NewMI = addOffset(MIB, -1);
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case X86::DEC16r:
|
|
|
|
if (DisableLEA16)
|
|
|
|
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
|
|
|
|
: nullptr;
|
|
|
|
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
|
|
|
|
NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
|
|
|
|
.addOperand(Dest).addOperand(Src), -1);
|
|
|
|
break;
|
|
|
|
case X86::ADD64rr:
|
|
|
|
case X86::ADD64rr_DB:
|
|
|
|
case X86::ADD32rr:
|
|
|
|
case X86::ADD32rr_DB: {
|
|
|
|
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
|
|
|
|
unsigned Opc;
|
|
|
|
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
|
|
|
|
Opc = X86::LEA64r;
|
|
|
|
else
|
|
|
|
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
|
|
|
|
|
|
|
|
bool isKill, isUndef;
|
|
|
|
unsigned SrcReg;
|
|
|
|
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
|
|
|
|
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
|
|
|
|
SrcReg, isKill, isUndef, ImplicitOp))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
    const MachineOperand &Src2 = MI->getOperand(2);
    bool isKill2, isUndef2;
    unsigned SrcReg2;
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
                        SrcReg2, isKill2, isUndef2, ImplicitOp2))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addOperand(Dest);
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);
    if (ImplicitOp2.getReg() != 0)
      MIB.addOperand(ImplicitOp2);

    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);

    // Preserve undefness of the operands.
    NewMI->getOperand(1).setIsUndef(isUndef);
    NewMI->getOperand(3).setIsUndef(isUndef2);

    if (LV && Src2.isKill())
      LV->replaceKillInstruction(SrcReg2, MI, NewMI);
    break;
  }
  case X86::ADD16rr:
  case X86::ADD16rr_DB: {
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Src2 = MI->getOperand(2).getReg();
    bool isKill2 = MI->getOperand(2).isKill();
    NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest),
                      Src.getReg(), Src.isKill(), Src2, isKill2);

    // Preserve undefness of the operands.
    bool isUndef = MI->getOperand(1).isUndef();
    bool isUndef2 = MI->getOperand(2).isUndef();
    NewMI->getOperand(1).setIsUndef(isUndef);
    NewMI->getOperand(3).setIsUndef(isUndef2);

    if (LV && isKill2)
      LV->replaceKillInstruction(Src2, MI, NewMI);
    break;
  }
  case X86::ADD64ri32:
  case X86::ADD64ri8:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8_DB:
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
                      .addOperand(Dest).addOperand(Src),
                      MI->getOperand(2).getImm());
    break;
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri_DB:
  case X86::ADD32ri8_DB: {
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addOperand(Dest)
        .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);

    NewMI = addOffset(MIB, MI->getOperand(2).getImm());
    break;
  }
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest).addOperand(Src),
                      MI->getOperand(2).getImm());
    break;
  }

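  // If no LEA was formed above, signal that the conversion failed.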
  if (!NewMI) return nullptr;

  if (LV) {  // Update live variables
    if (Src.isKill())
      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
    if (Dest.isDead())
      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
  }

  MFI->insert(MBBI, NewMI);          // Insert the new inst
  return NewMI;
}

/// We have a few instructions that must be hacked on to commute them.
///
MachineInstr *
X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
  switch (MI->getOpcode()) {
  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
    unsigned Opc;
    unsigned Size;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
    }
    unsigned Amt = MI->getOperand(3).getImm();
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
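    // Flip to the opposite double-shift opcode and adjust the shift amount:
    // shifting the swapped operand pair by Size-Amt produces the same result.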
    MI->setDesc(get(Opc));
    MI->getOperand(3).setImm(Size-Amt);
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::BLENDPDrri:
  case X86::BLENDPSrri:
  case X86::PBLENDWrri:
  case X86::VBLENDPDrri:
  case X86::VBLENDPSrri:
  case X86::VBLENDPDYrri:
  case X86::VBLENDPSYrri:
  case X86::VPBLENDDrri:
  case X86::VPBLENDWrri:
  case X86::VPBLENDDYrri:
  case X86::VPBLENDWYrri:{
    unsigned Mask;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::BLENDPDrri:   Mask = 0x03; break;
    case X86::BLENDPSrri:   Mask = 0x0F; break;
    case X86::PBLENDWrri:   Mask = 0xFF; break;
    case X86::VBLENDPDrri:  Mask = 0x03; break;
    case X86::VBLENDPSrri:  Mask = 0x0F; break;
    case X86::VBLENDPDYrri: Mask = 0x0F; break;
    case X86::VBLENDPSYrri: Mask = 0xFF; break;
    case X86::VPBLENDDrri:  Mask = 0x0F; break;
    case X86::VPBLENDWrri:  Mask = 0xFF; break;
    case X86::VPBLENDDYrri: Mask = 0xFF; break;
    case X86::VPBLENDWYrri: Mask = 0xFF; break;
    }
    // Only the least significant bits of Imm are used.
    unsigned Imm = MI->getOperand(3).getImm() & Mask;
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
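    // Swapping the two blend sources flips every lane-select bit, which is
    // exactly Imm XOR Mask.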
    MI->getOperand(3).setImm(Mask ^ Imm);
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::PCLMULQDQrr:
  case X86::VPCLMULQDQrr:{
    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
    unsigned Imm = MI->getOperand(3).getImm();
    unsigned Src1Hi = Imm & 0x01;
    unsigned Src2Hi = Imm & 0x10;
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
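    // After the sources are swapped, bit 0 must describe the new SRC1 and
    // bit 4 the new SRC2, so the two selector bits trade places.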
    MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::CMPPDrri:
  case X86::CMPPSrri:
  case X86::VCMPPDrri:
  case X86::VCMPPSrri:
  case X86::VCMPPDYrri:
  case X86::VCMPPSYrri: {
    // Float comparison can be safely commuted for
    // Ordered/Unordered/Equal/NotEqual tests
    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
    switch (Imm) {
    case 0x00: // EQUAL
    case 0x03: // UNORDERED
    case 0x04: // NOT EQUAL
    case 0x07: // ORDERED
      if (NewMI) {
        MachineFunction &MF = *MI->getParent()->getParent();
        MI = MF.CloneMachineInstr(MI);
        NewMI = false;
      }
      return TargetInstrInfo::commuteInstruction(MI, NewMI);
    default:
      return nullptr;
    }
  }
  case X86::VPCOMBri: case X86::VPCOMUBri:
  case X86::VPCOMDri: case X86::VPCOMUDri:
  case X86::VPCOMQri: case X86::VPCOMUQri:
  case X86::VPCOMWri: case X86::VPCOMUWri: {
    // Flip comparison mode immediate (if necessary).
    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
    switch (Imm) {
    case 0x00: Imm = 0x02; break; // LT -> GT
    case 0x01: Imm = 0x03; break; // LE -> GE
    case 0x02: Imm = 0x00; break; // GT -> LT
    case 0x03: Imm = 0x01; break; // GE -> LE
    case 0x04: // EQ
    case 0x05: // NE
    case 0x06: // FALSE
    case 0x07: // TRUE
    default:
      break;
    }
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
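    // EQ, NE, FALSE and TRUE are symmetric and were left untouched above; only
    // the ordered relations had their direction flipped.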
    MI->getOperand(3).setImm(Imm);
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
    }
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
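    // The two data operands are about to be swapped, so switch to the
    // inverted condition to keep the same value selected.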
    MI->setDesc(get(Opc));
    // Fallthrough intended.
  }
  default:
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
}

bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                         unsigned &SrcOpIdx2) const {
  switch (MI->getOpcode()) {
  case X86::CMPPDrri:
  case X86::CMPPSrri:
  case X86::VCMPPDrri:
  case X86::VCMPPSrri:
  case X86::VCMPPDYrri:
  case X86::VCMPPSYrri: {
    // Float comparison can be safely commuted for
    // Ordered/Unordered/Equal/NotEqual tests
    unsigned Imm = MI->getOperand(3).getImm() & 0x7;
    switch (Imm) {
    case 0x00: // EQUAL
    case 0x03: // UNORDERED
    case 0x04: // NOT EQUAL
    case 0x07: // ORDERED
      SrcOpIdx1 = 1;
      SrcOpIdx2 = 2;
      return true;
    }
    return false;
  }
  case X86::VFMADDPDr231r:
  case X86::VFMADDPSr231r:
  case X86::VFMADDSDr231r:
  case X86::VFMADDSSr231r:
  case X86::VFMSUBPDr231r:
  case X86::VFMSUBPSr231r:
  case X86::VFMSUBSDr231r:
  case X86::VFMSUBSSr231r:
  case X86::VFNMADDPDr231r:
  case X86::VFNMADDPSr231r:
  case X86::VFNMADDSDr231r:
  case X86::VFNMADDSSr231r:
  case X86::VFNMSUBPDr231r:
  case X86::VFNMSUBPSr231r:
  case X86::VFNMSUBSDr231r:
  case X86::VFNMSUBSSr231r:
  case X86::VFMADDPDr231rY:
  case X86::VFMADDPSr231rY:
  case X86::VFMSUBPDr231rY:
  case X86::VFMSUBPSr231rY:
  case X86::VFNMADDPDr231rY:
  case X86::VFNMADDPSr231rY:
  case X86::VFNMSUBPDr231rY:
  case X86::VFNMSUBPSr231rY:
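    // In the 231 FMA forms operands 2 and 3 are the two factors of the
    // multiply, and multiplication commutes, so those are the indices to
    // report.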
    SrcOpIdx1 = 2;
    SrcOpIdx2 = 3;
    return true;
  default:
    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
  }
}

static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
  switch (BrOpc) {
  default: return X86::COND_INVALID;
  case X86::JE_1:  return X86::COND_E;
  case X86::JNE_1: return X86::COND_NE;
  case X86::JL_1:  return X86::COND_L;
  case X86::JLE_1: return X86::COND_LE;
  case X86::JG_1:  return X86::COND_G;
  case X86::JGE_1: return X86::COND_GE;
  case X86::JB_1:  return X86::COND_B;
  case X86::JBE_1: return X86::COND_BE;
  case X86::JA_1:  return X86::COND_A;
  case X86::JAE_1: return X86::COND_AE;
  case X86::JS_1:  return X86::COND_S;
  case X86::JNS_1: return X86::COND_NS;
  case X86::JP_1:  return X86::COND_P;
  case X86::JNP_1: return X86::COND_NP;
  case X86::JO_1:  return X86::COND_O;
  case X86::JNO_1: return X86::COND_NO;
  }
}

/// Return condition code of a SET opcode.
static X86::CondCode getCondFromSETOpc(unsigned Opc) {
  switch (Opc) {
  default: return X86::COND_INVALID;
  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
  }
}

/// Return condition code of a CMov opcode.
X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
  switch (Opc) {
  default: return X86::COND_INVALID;
  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
    return X86::COND_A;
  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
    return X86::COND_AE;
  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
    return X86::COND_B;
  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
    return X86::COND_BE;
  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
    return X86::COND_E;
  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
    return X86::COND_G;
  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
    return X86::COND_GE;
  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
    return X86::COND_L;
  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
    return X86::COND_LE;
  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
    return X86::COND_NE;
  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
    return X86::COND_NO;
  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
    return X86::COND_NP;
  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
    return X86::COND_NS;
  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
    return X86::COND_O;
  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
    return X86::COND_P;
  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
    return X86::COND_S;
  }
}

unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Illegal condition code!");
  case X86::COND_E:  return X86::JE_1;
  case X86::COND_NE: return X86::JNE_1;
  case X86::COND_L:  return X86::JL_1;
  case X86::COND_LE: return X86::JLE_1;
  case X86::COND_G:  return X86::JG_1;
  case X86::COND_GE: return X86::JGE_1;
  case X86::COND_B:  return X86::JB_1;
  case X86::COND_BE: return X86::JBE_1;
  case X86::COND_A:  return X86::JA_1;
  case X86::COND_AE: return X86::JAE_1;
  case X86::COND_S:  return X86::JS_1;
  case X86::COND_NS: return X86::JNS_1;
  case X86::COND_P:  return X86::JP_1;
  case X86::COND_NP: return X86::JNP_1;
  case X86::COND_O:  return X86::JO_1;
  case X86::COND_NO: return X86::JNO_1;
  }
}

/// Return the inverse of the specified condition,
/// e.g. turning COND_E to COND_NE.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Illegal condition code!");
  case X86::COND_E:  return X86::COND_NE;
  case X86::COND_NE: return X86::COND_E;
  case X86::COND_L:  return X86::COND_GE;
  case X86::COND_LE: return X86::COND_G;
  case X86::COND_G:  return X86::COND_LE;
  case X86::COND_GE: return X86::COND_L;
  case X86::COND_B:  return X86::COND_AE;
  case X86::COND_BE: return X86::COND_A;
  case X86::COND_A:  return X86::COND_BE;
  case X86::COND_AE: return X86::COND_B;
  case X86::COND_S:  return X86::COND_NS;
  case X86::COND_NS: return X86::COND_S;
  case X86::COND_P:  return X86::COND_NP;
  case X86::COND_NP: return X86::COND_P;
  case X86::COND_O:  return X86::COND_NO;
  case X86::COND_NO: return X86::COND_O;
  }
}

/// Assuming the flags are set by MI(a,b), return the condition code if we
/// modify the instructions such that flags are set by MI(b,a).
static X86::CondCode getSwappedCondition(X86::CondCode CC) {
  switch (CC) {
  default: return X86::COND_INVALID;
  case X86::COND_E:  return X86::COND_E;
  case X86::COND_NE: return X86::COND_NE;
  case X86::COND_L:  return X86::COND_G;
  case X86::COND_LE: return X86::COND_GE;
  case X86::COND_G:  return X86::COND_L;
  case X86::COND_GE: return X86::COND_LE;
  case X86::COND_B:  return X86::COND_A;
  case X86::COND_BE: return X86::COND_AE;
  case X86::COND_A:  return X86::COND_B;
  case X86::COND_AE: return X86::COND_BE;
  }
}

/// Return a set opcode for the given condition and
/// whether it has a memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
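  // One row per standard condition code, in X86::CondCode order; column 0 is
  // the register form and column 1 the memory form.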
  static const uint16_t Opc[16][2] = {
    { X86::SETAr,  X86::SETAm  },
    { X86::SETAEr, X86::SETAEm },
    { X86::SETBr,  X86::SETBm  },
    { X86::SETBEr, X86::SETBEm },
    { X86::SETEr,  X86::SETEm  },
    { X86::SETGr,  X86::SETGm  },
    { X86::SETGEr, X86::SETGEm },
    { X86::SETLr,  X86::SETLm  },
    { X86::SETLEr, X86::SETLEm },
    { X86::SETNEr, X86::SETNEm },
    { X86::SETNOr, X86::SETNOm },
    { X86::SETNPr, X86::SETNPm },
    { X86::SETNSr, X86::SETNSm },
    { X86::SETOr,  X86::SETOm  },
    { X86::SETPr,  X86::SETPm  },
    { X86::SETSr,  X86::SETSm  }
  };

  assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
  return Opc[CC][HasMemoryOperand ? 1 : 0];
}

/// Return a cmov opcode for the given condition,
/// register size in bytes, and operand type.
unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
                              bool HasMemoryOperand) {
  static const uint16_t Opc[32][3] = {
    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
  };

  assert(CC < 16 && "Can only handle standard cond codes");
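  // Register (rr) forms occupy rows 0-15, memory (rm) forms rows 16-31.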
  unsigned Idx = HasMemoryOperand ? 16+CC : CC;
  switch(RegBytes) {
  default: llvm_unreachable("Illegal register size!");
  case 2: return Opc[Idx][0];
  case 4: return Opc[Idx][1];
  case 8: return Opc[Idx][2];
  }
}

bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
  if (!MI->isTerminator()) return false;

  // Conditional branch is a special case.
  if (MI->isBranch() && !MI->isBarrier())
    return true;
  if (!MI->isPredicable())
    return true;
  return !isPredicated(MI);
}

bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                 MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
  // Start from the bottom of the block and work up, examining the
  // terminator instructions.
  MachineBasicBlock::iterator I = MBB.end();
  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
  while (I != MBB.begin()) {
    --I;
    if (I->isDebugValue())
      continue;

    // Working from the bottom, when we see a non-terminator instruction, we're
    // done.
    if (!isUnpredicatedTerminator(I))
      break;

    // A terminator that isn't a branch can't easily be handled by this
    // analysis.
    if (!I->isBranch())
      return true;

    // Handle unconditional branches.
    if (I->getOpcode() == X86::JMP_1) {
      UnCondBrIter = I;

      if (!AllowModify) {
        TBB = I->getOperand(0).getMBB();
        continue;
      }

      // If the block has any instructions after a JMP, delete them.
      while (std::next(I) != MBB.end())
        std::next(I)->eraseFromParent();

      Cond.clear();
      FBB = nullptr;

      // Delete the JMP if it's equivalent to a fall-through.
      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
        TBB = nullptr;
        I->eraseFromParent();
        I = MBB.end();
        UnCondBrIter = MBB.end();
        continue;
      }

      // TBB is used to indicate the unconditional destination.
      TBB = I->getOperand(0).getMBB();
      continue;
    }

    // Handle conditional branches.
    X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
    if (BranchCode == X86::COND_INVALID)
      return true; // Can't handle indirect branch.

    // Working from the bottom, handle the first conditional branch.
    if (Cond.empty()) {
      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
      if (AllowModify && UnCondBrIter != MBB.end() &&
          MBB.isLayoutSuccessor(TargetBB)) {
        // If we can modify the code and it ends in something like:
        //
        //   jCC L1
        //   jmp L2
        // L1:
        //   ...
        // L2:
        //
        // Then we can change this to:
        //
        //   jnCC L2
        // L1:
        //   ...
        // L2:
        //
        // Which is a bit more efficient.
        // We conditionally jump to the fall-through block.
        BranchCode = GetOppositeBranchCondition(BranchCode);
        unsigned JNCC = GetCondBranchFromCond(BranchCode);
        MachineBasicBlock::iterator OldInst = I;

        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
          .addMBB(UnCondBrIter->getOperand(0).getMBB());
        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
          .addMBB(TargetBB);

        OldInst->eraseFromParent();
        UnCondBrIter->eraseFromParent();

        // Restart the analysis.
        UnCondBrIter = MBB.end();
        I = MBB.end();
        continue;
      }
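      // This is the first (bottom-most) conditional branch: its target is the
      // taken destination, and any unconditional jump seen below it supplies
      // the false block.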
FBB = TBB;
|
|
|
|
TBB = I->getOperand(0).getMBB();
|
|
|
|
Cond.push_back(MachineOperand::CreateImm(BranchCode));
|
|
|
|
continue;
|
|
|
|
}
|
2009-12-14 14:51:19 +08:00
|
|
|
|
|
|
|
// Handle subsequent conditional branches. Only handle the case where all
|
|
|
|
// conditional branches branch to the same destination and their condition
|
|
|
|
// opcodes fit one of the special multi-branch idioms.
|
Optimized FCMP_OEQ and FCMP_UNE for x86.
Where previously LLVM might emit code like this:
ucomisd %xmm1, %xmm0
setne %al
setp %cl
orb %al, %cl
jne .LBB4_2
it now emits this:
ucomisd %xmm1, %xmm0
jne .LBB4_2
jp .LBB4_2
It has fewer instructions and uses fewer registers, but it does
have more branches. And in the case that this code is followed by
a non-fallthrough edge, it may be followed by a jmp instruction,
resulting in three branch instructions in sequence. Some effort
is made to avoid this situation.
To achieve this, X86ISelLowering.cpp now recognizes FCMP_OEQ and
FCMP_UNE in lowered form, and replace them with code that emits
two branches, except in the case where it would require converting
a fall-through edge to an explicit branch.
Also, X86InstrInfo.cpp's branch analysis and transform code now
knows now to handle blocks with multiple conditional branches. It
uses loops instead of having fixed checks for up to two
instructions. It can now analyze and transform code generated
from FCMP_OEQ and FCMP_UNE.
llvm-svn: 57873
2008-10-21 11:29:32 +08:00
|
|
|
assert(Cond.size() == 1);
|
|
|
|
assert(TBB);
|
2009-12-14 14:51:19 +08:00
|
|
|
|
|
|
|
// Only handle the case where all conditional branches branch to the same
|
|
|
|
// destination.
|
Optimized FCMP_OEQ and FCMP_UNE for x86.
Where previously LLVM might emit code like this:
ucomisd %xmm1, %xmm0
setne %al
setp %cl
orb %al, %cl
jne .LBB4_2
it now emits this:
ucomisd %xmm1, %xmm0
jne .LBB4_2
jp .LBB4_2
It has fewer instructions and uses fewer registers, but it does
have more branches. And in the case that this code is followed by
a non-fallthrough edge, it may be followed by a jmp instruction,
resulting in three branch instructions in sequence. Some effort
is made to avoid this situation.
To achieve this, X86ISelLowering.cpp now recognizes FCMP_OEQ and
FCMP_UNE in lowered form, and replace them with code that emits
two branches, except in the case where it would require converting
a fall-through edge to an explicit branch.
Also, X86InstrInfo.cpp's branch analysis and transform code now
knows now to handle blocks with multiple conditional branches. It
uses loops instead of having fixed checks for up to two
instructions. It can now analyze and transform code generated
from FCMP_OEQ and FCMP_UNE.
llvm-svn: 57873
2008-10-21 11:29:32 +08:00
|
|
|
if (TBB != I->getOperand(0).getMBB())
|
|
|
|
return true;
|
2009-12-14 14:51:19 +08:00
|
|
|
|
Optimized FCMP_OEQ and FCMP_UNE for x86.
Where previously LLVM might emit code like this:
ucomisd %xmm1, %xmm0
setne %al
setp %cl
orb %al, %cl
jne .LBB4_2
it now emits this:
ucomisd %xmm1, %xmm0
jne .LBB4_2
jp .LBB4_2
It has fewer instructions and uses fewer registers, but it does
have more branches. And in the case that this code is followed by
a non-fallthrough edge, it may be followed by a jmp instruction,
resulting in three branch instructions in sequence. Some effort
is made to avoid this situation.
To achieve this, X86ISelLowering.cpp now recognizes FCMP_OEQ and
FCMP_UNE in lowered form, and replace them with code that emits
two branches, except in the case where it would require converting
a fall-through edge to an explicit branch.
Also, X86InstrInfo.cpp's branch analysis and transform code now
knows now to handle blocks with multiple conditional branches. It
uses loops instead of having fixed checks for up to two
instructions. It can now analyze and transform code generated
from FCMP_OEQ and FCMP_UNE.
llvm-svn: 57873
2008-10-21 11:29:32 +08:00
|
|
|
// If the conditions are the same, we can leave them alone.
|
2009-12-14 14:51:19 +08:00
|
|
|
X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
|
Optimized FCMP_OEQ and FCMP_UNE for x86.
Where previously LLVM might emit code like this:
ucomisd %xmm1, %xmm0
setne %al
setp %cl
orb %al, %cl
jne .LBB4_2
it now emits this:
ucomisd %xmm1, %xmm0
jne .LBB4_2
jp .LBB4_2
It has fewer instructions and uses fewer registers, but it does
have more branches. And in the case that this code is followed by
a non-fallthrough edge, it may be followed by a jmp instruction,
resulting in three branch instructions in sequence. Some effort
is made to avoid this situation.
To achieve this, X86ISelLowering.cpp now recognizes FCMP_OEQ and
FCMP_UNE in lowered form, and replace them with code that emits
two branches, except in the case where it would require converting
a fall-through edge to an explicit branch.
Also, X86InstrInfo.cpp's branch analysis and transform code now
knows now to handle blocks with multiple conditional branches. It
uses loops instead of having fixed checks for up to two
instructions. It can now analyze and transform code generated
from FCMP_OEQ and FCMP_UNE.
llvm-svn: 57873
2008-10-21 11:29:32 +08:00
|
|
|
if (OldBranchCode == BranchCode)
|
|
|
|
continue;
|
2009-12-14 14:51:19 +08:00
|
|
|
|
|
|
|
// If they differ, see if they fit one of the known patterns. Theoretically,
|
|
|
|
// we could handle more patterns here, but we shouldn't expect to see them
|
|
|
|
// if instruction selection has done a reasonable job.
|
Optimized FCMP_OEQ and FCMP_UNE for x86.
Where previously LLVM might emit code like this:
ucomisd %xmm1, %xmm0
setne %al
setp %cl
orb %al, %cl
jne .LBB4_2
it now emits this:
ucomisd %xmm1, %xmm0
jne .LBB4_2
jp .LBB4_2
It has fewer instructions and uses fewer registers, but it does
have more branches. And in the case that this code is followed by
a non-fallthrough edge, it may be followed by a jmp instruction,
resulting in three branch instructions in sequence. Some effort
is made to avoid this situation.
To achieve this, X86ISelLowering.cpp now recognizes FCMP_OEQ and
FCMP_UNE in lowered form, and replace them with code that emits
two branches, except in the case where it would require converting
a fall-through edge to an explicit branch.
Also, X86InstrInfo.cpp's branch analysis and transform code now
knows now to handle blocks with multiple conditional branches. It
uses loops instead of having fixed checks for up to two
instructions. It can now analyze and transform code generated
from FCMP_OEQ and FCMP_UNE.
llvm-svn: 57873
2008-10-21 11:29:32 +08:00
|
|
|
if ((OldBranchCode == X86::COND_NP &&
|
|
|
|
BranchCode == X86::COND_E) ||
|
|
|
|
(OldBranchCode == X86::COND_E &&
|
|
|
|
BranchCode == X86::COND_NP))
|
|
|
|
BranchCode = X86::COND_NP_OR_E;
|
|
|
|
else if ((OldBranchCode == X86::COND_P &&
|
|
|
|
BranchCode == X86::COND_NE) ||
|
|
|
|
(OldBranchCode == X86::COND_NE &&
|
|
|
|
BranchCode == X86::COND_P))
|
|
|
|
BranchCode = X86::COND_NE_OR_P;
|
|
|
|
else
|
|
|
|
return true;
|
2009-12-14 14:51:19 +08:00
|
|
|
|
2008-10-21 11:29:32 +08:00
|
|
|
// Update the MachineOperand.
|
|
|
|
Cond[0].setImm(BranchCode);
|
2007-06-14 01:59:52 +08:00
|
|
|
}
|
|
|
|
|
2008-10-21 11:29:32 +08:00
|
|
|
return false;
|
2006-10-21 01:42:20 +08:00
|
|
|
}
|
|
|
|
|
2007-05-18 08:18:17 +08:00
|
|
|
unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
|
2006-10-21 01:42:20 +08:00
|
|
|
MachineBasicBlock::iterator I = MBB.end();
|
2008-10-21 11:29:32 +08:00
|
|
|
unsigned Count = 0;
|
|
|
|
|
|
|
|
while (I != MBB.begin()) {
|
|
|
|
--I;
|
2010-04-02 09:38:09 +08:00
|
|
|
if (I->isDebugValue())
|
|
|
|
continue;
|
2015-01-06 12:23:53 +08:00
|
|
|
if (I->getOpcode() != X86::JMP_1 &&
|
2012-07-10 02:57:12 +08:00
|
|
|
getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
|
2008-10-21 11:29:32 +08:00
|
|
|
break;
|
|
|
|
// Remove the branch.
|
|
|
|
I->eraseFromParent();
|
|
|
|
I = MBB.end();
|
|
|
|
++Count;
|
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-10-21 11:29:32 +08:00
|
|
|
return Count;
|
2006-10-21 01:42:20 +08:00
|
|
|
}
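// Illustrative note (not upstream documentation): RemoveBranch is the
// counterpart of AnalyzeBranch and InsertBranch. A typical rewrite first
// calls AnalyzeBranch to recover TBB/FBB/Cond, then RemoveBranch to strip
// the old terminators, and finally InsertBranch (below) to emit the updated
// ones. Debug values are skipped here so they survive the rewrite.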
|
|
|
|
|
2007-05-18 08:18:17 +08:00
|
|
|
unsigned
|
|
|
|
X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
|
|
|
|
MachineBasicBlock *FBB,
|
2010-06-18 06:43:56 +08:00
|
|
|
const SmallVectorImpl<MachineOperand> &Cond,
|
|
|
|
DebugLoc DL) const {
|
2006-10-21 01:42:20 +08:00
|
|
|
// Shouldn't be a fall through.
|
|
|
|
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
|
2006-10-21 13:34:23 +08:00
|
|
|
assert((Cond.size() == 1 || Cond.size() == 0) &&
|
|
|
|
"X86 branch conditions have one component!");
|
|
|
|
|
2008-10-21 11:29:32 +08:00
|
|
|
if (Cond.empty()) {
|
|
|
|
// Unconditional branch?
|
|
|
|
assert(!FBB && "Unconditional branch with multiple successors!");
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
|
2007-05-18 08:18:17 +08:00
|
|
|
return 1;
|
2006-10-21 01:42:20 +08:00
|
|
|
}
|
2008-10-21 11:29:32 +08:00
|
|
|
|
|
|
|
// Conditional branch.
|
|
|
|
unsigned Count = 0;
|
|
|
|
X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
|
|
|
|
switch (CC) {
|
|
|
|
case X86::COND_NP_OR_E:
|
|
|
|
// Synthesize NP_OR_E with two branches.
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
|
2010-03-05 08:33:59 +08:00
|
|
|
++Count;
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB);
|
2010-03-05 08:33:59 +08:00
|
|
|
++Count;
|
2008-10-21 11:29:32 +08:00
|
|
|
break;
|
|
|
|
case X86::COND_NE_OR_P:
|
|
|
|
// Synthesize NE_OR_P with two branches.
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
|
2010-03-05 08:33:59 +08:00
|
|
|
++Count;
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
|
2010-03-05 08:33:59 +08:00
|
|
|
++Count;
|
2008-10-21 11:29:32 +08:00
|
|
|
break;
|
2010-03-05 08:33:59 +08:00
|
|
|
default: {
|
|
|
|
unsigned Opc = GetCondBranchFromCond(CC);
|
2010-06-18 06:43:56 +08:00
|
|
|
BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
|
2010-03-05 08:33:59 +08:00
|
|
|
++Count;
|
|
|
|
}
|
2008-10-21 11:29:32 +08:00
|
|
|
}
|
|
|
|
if (FBB) {
|
|
|
|
// Two-way conditional branch. Insert the second branch.
|
2015-01-06 12:23:53 +08:00
|
|
|
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
|
2008-10-21 11:29:32 +08:00
|
|
|
++Count;
|
|
|
|
}
|
|
|
|
return Count;
|
2006-10-21 01:42:20 +08:00
|
|
|
}
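// Illustrative sketch (taken from the commit message earlier in this file):
// lowering the composite COND_NE_OR_P condition for FCMP_UNE produces two
// conditional branches to the same target instead of setcc/or, e.g.
//   ucomisd %xmm1, %xmm0
//   jne .LBB4_2
//   jp  .LBB4_2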
|
|
|
|
|
2012-07-04 08:09:58 +08:00
|
|
|
bool X86InstrInfo::
|
|
|
|
canInsertSelect(const MachineBasicBlock &MBB,
|
|
|
|
const SmallVectorImpl<MachineOperand> &Cond,
|
|
|
|
unsigned TrueReg, unsigned FalseReg,
|
|
|
|
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
|
|
|
|
// Not all subtargets have cmov instructions.
|
2014-06-11 06:34:31 +08:00
|
|
|
if (!Subtarget.hasCMov())
|
2012-07-04 08:09:58 +08:00
|
|
|
return false;
|
|
|
|
if (Cond.size() != 1)
|
|
|
|
return false;
|
|
|
|
// We cannot do the composite conditions, at least not in SSA form.
|
|
|
|
if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Check register classes.
|
|
|
|
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
|
|
|
|
const TargetRegisterClass *RC =
|
|
|
|
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
|
|
|
|
if (!RC)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// We have cmov instructions for 16, 32, and 64 bit general purpose registers.
|
|
|
|
if (X86::GR16RegClass.hasSubClassEq(RC) ||
|
|
|
|
X86::GR32RegClass.hasSubClassEq(RC) ||
|
|
|
|
X86::GR64RegClass.hasSubClassEq(RC)) {
|
|
|
|
// This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
|
|
|
|
// Bridge. Probably Ivy Bridge as well.
|
|
|
|
CondCycles = 2;
|
|
|
|
TrueCycles = 2;
|
|
|
|
FalseCycles = 2;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Can't do vectors.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
|
|
|
|
MachineBasicBlock::iterator I, DebugLoc DL,
|
|
|
|
unsigned DstReg,
|
|
|
|
const SmallVectorImpl<MachineOperand> &Cond,
|
|
|
|
unsigned TrueReg, unsigned FalseReg) const {
|
|
|
|
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
|
|
|
|
assert(Cond.size() == 1 && "Invalid Cond array");
|
|
|
|
unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
|
2012-07-10 02:57:12 +08:00
|
|
|
MRI.getRegClass(DstReg)->getSize(),
|
|
|
|
false/*HasMemoryOperand*/);
|
2012-07-04 08:09:58 +08:00
|
|
|
BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
|
|
|
|
}
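// Minimal usage sketch (hypothetical operands, not upstream code): for a
// 32-bit select on Cond = { X86::COND_NE }, insertSelect emits a single
//   %dst = CMOVNE32rr %false, %true
// where the false value is the tied operand that is kept when the condition
// does not hold.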
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Test if the given register is a physical h register.
|
2009-04-15 08:04:23 +08:00
|
|
|
static bool isHReg(unsigned Reg) {
|
2009-04-28 00:41:36 +08:00
|
|
|
return X86::GR8_ABCD_HRegClass.contains(Reg);
|
2009-04-15 08:04:23 +08:00
|
|
|
}
|
|
|
|
|
2010-08-27 22:43:06 +08:00
|
|
|
// Try to copy between VR128/VR64 and GR64 registers.
|
2011-09-14 10:36:58 +08:00
|
|
|
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
|
2014-06-11 06:34:31 +08:00
|
|
|
const X86Subtarget &Subtarget) {
|
2013-08-11 15:55:09 +08:00
|
|
|
|
2010-08-27 22:43:06 +08:00
|
|
|
// SrcReg(VR128) -> DestReg(GR64)
|
|
|
|
// SrcReg(VR64) -> DestReg(GR64)
|
|
|
|
// SrcReg(GR64) -> DestReg(VR128)
|
|
|
|
// SrcReg(GR64) -> DestReg(VR64)
|
|
|
|
|
2013-08-11 15:55:09 +08:00
|
|
|
bool HasAVX = Subtarget.hasAVX();
|
2013-08-18 21:08:57 +08:00
|
|
|
bool HasAVX512 = Subtarget.hasAVX512();
|
2010-08-27 22:43:06 +08:00
|
|
|
if (X86::GR64RegClass.contains(DestReg)) {
|
2013-08-18 21:08:57 +08:00
|
|
|
if (X86::VR128XRegClass.contains(SrcReg))
|
2010-08-27 22:43:06 +08:00
|
|
|
// Copy from a VR128 register to a GR64 register.
|
2013-08-18 21:08:57 +08:00
|
|
|
return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
|
|
|
|
X86::MOVPQIto64rr);
|
2012-08-21 16:29:51 +08:00
|
|
|
if (X86::VR64RegClass.contains(SrcReg))
|
2010-08-27 22:43:06 +08:00
|
|
|
// Copy from a VR64 register to a GR64 register.
|
|
|
|
return X86::MOVSDto64rr;
|
|
|
|
} else if (X86::GR64RegClass.contains(SrcReg)) {
|
|
|
|
// Copy from a GR64 register to a VR128 register.
|
2013-08-18 21:08:57 +08:00
|
|
|
if (X86::VR128XRegClass.contains(DestReg))
|
|
|
|
return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
|
|
|
|
X86::MOV64toPQIrr);
|
2010-08-27 22:43:06 +08:00
|
|
|
// Copy from a GR64 register to a VR64 register.
|
2012-08-21 16:29:51 +08:00
|
|
|
if (X86::VR64RegClass.contains(DestReg))
|
2010-08-27 22:43:06 +08:00
|
|
|
return X86::MOV64toSDrr;
|
|
|
|
}
|
|
|
|
|
2011-09-23 06:45:24 +08:00
|
|
|
// SrcReg(FR32) -> DestReg(GR32)
|
|
|
|
// SrcReg(GR32) -> DestReg(FR32)
|
|
|
|
|
2013-08-18 21:08:57 +08:00
|
|
|
if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
|
2012-08-21 16:29:51 +08:00
|
|
|
// Copy from a FR32 register to a GR32 register.
|
2013-08-18 21:08:57 +08:00
|
|
|
return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
|
2011-09-23 06:45:24 +08:00
|
|
|
|
2013-08-18 21:08:57 +08:00
|
|
|
if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
|
2012-08-21 16:29:51 +08:00
|
|
|
// Copy from a GR32 register to a FR32 register.
|
2013-08-18 21:08:57 +08:00
|
|
|
return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
|
2010-08-27 22:43:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-12-16 21:52:35 +08:00
|
|
|
inline static bool MaskRegClassContains(unsigned Reg) {
|
|
|
|
return X86::VK8RegClass.contains(Reg) ||
|
|
|
|
X86::VK16RegClass.contains(Reg) ||
|
2014-07-23 22:49:42 +08:00
|
|
|
X86::VK32RegClass.contains(Reg) ||
|
|
|
|
X86::VK64RegClass.contains(Reg) ||
|
2013-12-16 21:52:35 +08:00
|
|
|
X86::VK1RegClass.contains(Reg);
|
|
|
|
}
|
2013-08-11 15:55:09 +08:00
|
|
|
static
|
|
|
|
unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
|
|
|
|
if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
|
|
|
|
X86::VR256XRegClass.contains(DestReg, SrcReg) ||
|
|
|
|
X86::VR512RegClass.contains(DestReg, SrcReg)) {
|
|
|
|
DestReg = get512BitSuperRegister(DestReg);
|
|
|
|
SrcReg = get512BitSuperRegister(SrcReg);
|
|
|
|
return X86::VMOVAPSZrr;
|
|
|
|
}
|
2013-12-16 21:52:35 +08:00
|
|
|
if (MaskRegClassContains(DestReg) &&
|
|
|
|
MaskRegClassContains(SrcReg))
|
2013-08-11 15:55:09 +08:00
|
|
|
return X86::KMOVWkk;
|
2013-12-16 21:52:35 +08:00
|
|
|
if (MaskRegClassContains(DestReg) &&
|
2013-12-10 19:58:35 +08:00
|
|
|
(X86::GR32RegClass.contains(SrcReg) ||
|
|
|
|
X86::GR16RegClass.contains(SrcReg) ||
|
|
|
|
X86::GR8RegClass.contains(SrcReg))) {
|
|
|
|
SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
|
|
|
|
return X86::KMOVWkr;
|
|
|
|
}
|
|
|
|
if ((X86::GR32RegClass.contains(DestReg) ||
|
|
|
|
X86::GR16RegClass.contains(DestReg) ||
|
|
|
|
X86::GR8RegClass.contains(DestReg)) &&
|
2013-12-16 21:52:35 +08:00
|
|
|
MaskRegClassContains(SrcReg)) {
|
2013-12-10 19:58:35 +08:00
|
|
|
DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
|
|
|
|
return X86::KMOVWrk;
|
|
|
|
}
|
2013-08-11 15:55:09 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-07-09 03:46:25 +08:00
|
|
|
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|
|
|
MachineBasicBlock::iterator MI, DebugLoc DL,
|
|
|
|
unsigned DestReg, unsigned SrcReg,
|
|
|
|
bool KillSrc) const {
|
|
|
|
// First deal with the normal symmetric copies.
|
2014-06-11 06:34:31 +08:00
|
|
|
bool HasAVX = Subtarget.hasAVX();
|
|
|
|
bool HasAVX512 = Subtarget.hasAVX512();
|
2013-08-11 15:55:09 +08:00
|
|
|
unsigned Opc = 0;
|
2010-07-09 03:46:25 +08:00
|
|
|
if (X86::GR64RegClass.contains(DestReg, SrcReg))
|
|
|
|
Opc = X86::MOV64rr;
|
|
|
|
else if (X86::GR32RegClass.contains(DestReg, SrcReg))
|
|
|
|
Opc = X86::MOV32rr;
|
|
|
|
else if (X86::GR16RegClass.contains(DestReg, SrcReg))
|
|
|
|
Opc = X86::MOV16rr;
|
|
|
|
else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
|
|
|
|
// Copying to or from a physical H register on x86-64 requires a NOREX
|
|
|
|
// move. Otherwise use a normal move.
|
|
|
|
if ((isHReg(DestReg) || isHReg(SrcReg)) &&
|
2014-06-11 06:34:31 +08:00
|
|
|
Subtarget.is64Bit()) {
|
2010-07-09 03:46:25 +08:00
|
|
|
Opc = X86::MOV8rr_NOREX;
|
2011-10-08 04:15:54 +08:00
|
|
|
// Both operands must be encodable without a REX prefix.
|
|
|
|
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
|
|
|
|
"8-bit H register can not be copied outside GR8_NOREX");
|
|
|
|
} else
|
2010-07-09 03:46:25 +08:00
|
|
|
Opc = X86::MOV8rr;
|
2013-08-11 15:55:09 +08:00
|
|
|
}
|
|
|
|
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
|
|
|
|
Opc = X86::MMX_MOVQ64rr;
|
|
|
|
else if (HasAVX512)
|
|
|
|
Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
|
|
|
|
else if (X86::VR128RegClass.contains(DestReg, SrcReg))
|
2011-09-14 10:36:58 +08:00
|
|
|
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
|
2011-07-15 02:50:58 +08:00
|
|
|
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
|
|
|
|
Opc = X86::VMOVAPSYrr;
|
2013-08-11 15:55:09 +08:00
|
|
|
if (!Opc)
|
2014-06-11 06:34:31 +08:00
|
|
|
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
|
2010-07-09 03:46:25 +08:00
|
|
|
|
|
|
|
if (Opc) {
|
|
|
|
BuildMI(MBB, MI, DL, get(Opc), DestReg)
|
|
|
|
.addReg(SrcReg, getKillRegState(KillSrc));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Moving EFLAGS to / from another register requires a push and a pop.
|
2012-12-22 07:48:49 +08:00
|
|
|
// Notice that we have to adjust the stack if we don't want to clobber the
|
2014-08-06 07:27:34 +08:00
|
|
|
// first frame index. See X86FrameLowering.cpp - clobbersTheStack.
|
2010-07-09 03:46:25 +08:00
|
|
|
if (SrcReg == X86::EFLAGS) {
|
|
|
|
if (X86::GR64RegClass.contains(DestReg)) {
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::PUSHF64));
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
|
|
|
|
return;
|
2012-08-21 16:29:51 +08:00
|
|
|
}
|
|
|
|
if (X86::GR32RegClass.contains(DestReg)) {
|
2010-07-09 03:46:25 +08:00
|
|
|
BuildMI(MBB, MI, DL, get(X86::PUSHF32));
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (DestReg == X86::EFLAGS) {
|
|
|
|
if (X86::GR64RegClass.contains(SrcReg)) {
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::PUSH64r))
|
|
|
|
.addReg(SrcReg, getKillRegState(KillSrc));
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::POPF64));
|
|
|
|
return;
|
2012-08-21 16:29:51 +08:00
|
|
|
}
|
|
|
|
if (X86::GR32RegClass.contains(SrcReg)) {
|
2010-07-09 03:46:25 +08:00
|
|
|
BuildMI(MBB, MI, DL, get(X86::PUSH32r))
|
|
|
|
.addReg(SrcReg, getKillRegState(KillSrc));
|
|
|
|
BuildMI(MBB, MI, DL, get(X86::POPF32));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
|
|
|
|
<< " to " << RI.getName(DestReg) << '\n');
|
|
|
|
llvm_unreachable("Cannot emit physreg copy instruction");
|
|
|
|
}
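// Illustrative expansion (an assumption about the emitted assembly): copying
// EFLAGS into a 64-bit GPR is modeled as a push/pop pair,
//   pushfq
//   popq  %rax
// and the reverse direction pushes the GPR and pops into EFLAGS via popfq,
// matching the PUSHF64/POP64r and PUSH64r/POPF64 sequences built above.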
|
|
|
|
|
2010-06-13 04:13:29 +08:00
|
|
|
static unsigned getLoadStoreRegOpcode(unsigned Reg,
|
|
|
|
const TargetRegisterClass *RC,
|
|
|
|
bool isStackAligned,
|
2014-06-11 06:34:31 +08:00
|
|
|
const X86Subtarget &STI,
|
2010-06-13 04:13:29 +08:00
|
|
|
bool load) {
|
2014-06-11 06:34:31 +08:00
|
|
|
if (STI.hasAVX512()) {
|
2013-10-15 06:18:56 +08:00
|
|
|
if (X86::VK8RegClass.hasSubClassEq(RC) ||
|
2013-08-18 21:08:57 +08:00
|
|
|
X86::VK16RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::KMOVWkm : X86::KMOVWmk;
|
2013-10-02 20:20:42 +08:00
|
|
|
if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
|
2013-08-18 21:08:57 +08:00
|
|
|
return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
|
2013-10-02 20:20:42 +08:00
|
|
|
if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
|
2013-08-18 21:08:57 +08:00
|
|
|
return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
|
2013-10-02 20:20:42 +08:00
|
|
|
if (X86::VR512RegClass.hasSubClassEq(RC))
|
2013-08-18 21:08:57 +08:00
|
|
|
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
|
|
|
|
}
|
|
|
|
|
2014-06-11 06:34:31 +08:00
|
|
|
bool HasAVX = STI.hasAVX();
|
2011-06-01 23:32:10 +08:00
|
|
|
switch (RC->getSize()) {
|
2010-07-12 11:43:04 +08:00
|
|
|
default:
|
2011-06-01 23:32:10 +08:00
|
|
|
llvm_unreachable("Unknown spill size");
|
|
|
|
case 1:
|
|
|
|
assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
|
2014-06-11 06:34:31 +08:00
|
|
|
if (STI.is64Bit())
|
2011-06-01 23:32:10 +08:00
|
|
|
// Copying to or from a physical H register on x86-64 requires a NOREX
|
|
|
|
// move. Otherwise use a normal move.
|
|
|
|
if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
|
|
|
|
return load ? X86::MOV8rm : X86::MOV8mr;
|
|
|
|
case 2:
|
|
|
|
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
|
|
|
|
return load ? X86::MOV16rm : X86::MOV16mr;
|
|
|
|
case 4:
|
|
|
|
if (X86::GR32RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::MOV32rm : X86::MOV32mr;
|
|
|
|
if (X86::FR32RegClass.hasSubClassEq(RC))
|
2011-09-14 10:36:58 +08:00
|
|
|
return load ?
|
|
|
|
(HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
|
|
|
|
(HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
|
2011-06-01 23:32:10 +08:00
|
|
|
if (X86::RFP32RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
|
|
|
|
llvm_unreachable("Unknown 4-byte regclass");
|
|
|
|
case 8:
|
|
|
|
if (X86::GR64RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::MOV64rm : X86::MOV64mr;
|
|
|
|
if (X86::FR64RegClass.hasSubClassEq(RC))
|
2011-09-14 10:36:58 +08:00
|
|
|
return load ?
|
|
|
|
(HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
|
|
|
|
(HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
|
2011-06-01 23:32:10 +08:00
|
|
|
if (X86::VR64RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
|
|
|
|
if (X86::RFP64RegClass.hasSubClassEq(RC))
|
|
|
|
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
|
|
|
|
llvm_unreachable("Unknown 8-byte regclass");
|
|
|
|
case 10:
|
|
|
|
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
|
2010-06-13 04:13:29 +08:00
|
|
|
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
|
2011-08-31 11:04:09 +08:00
|
|
|
case 16: {
|
2013-11-14 19:29:27 +08:00
|
|
|
assert((X86::VR128RegClass.hasSubClassEq(RC) ||
|
|
|
|
X86::VR128XRegClass.hasSubClassEq(RC)) && "Unknown 16-byte regclass");
|
2008-07-19 14:30:51 +08:00
|
|
|
// If stack is realigned we can use aligned stores.
|
2010-06-13 04:13:29 +08:00
|
|
|
if (isStackAligned)
|
2011-08-31 11:04:09 +08:00
|
|
|
return load ?
|
|
|
|
(HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
|
|
|
|
(HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
|
2010-06-13 04:13:29 +08:00
|
|
|
else
|
2011-08-31 11:04:09 +08:00
|
|
|
return load ?
|
|
|
|
(HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
|
|
|
|
(HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
|
|
|
|
}
|
2011-07-15 02:50:58 +08:00
|
|
|
case 32:
|
2013-11-14 19:29:27 +08:00
|
|
|
assert((X86::VR256RegClass.hasSubClassEq(RC) ||
|
|
|
|
X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
|
2011-07-15 02:50:58 +08:00
|
|
|
// If stack is realigned we can use aligned stores.
|
|
|
|
if (isStackAligned)
|
|
|
|
return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
|
|
|
|
else
|
|
|
|
return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
|
2013-08-18 21:08:57 +08:00
|
|
|
case 64:
|
|
|
|
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
|
|
|
|
if (isStackAligned)
|
|
|
|
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
|
|
|
|
else
|
|
|
|
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
|
2008-01-02 05:11:32 +08:00
|
|
|
}
|
2010-06-13 04:13:29 +08:00
|
|
|
}
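// Example (illustrative): spilling a 16-byte VR128 value picks MOVAPSmr
// (VMOVAPSmr with AVX) when the slot is known to be 16-byte aligned, and
// falls back to the unaligned MOVUPSmr/VMOVUPSmr forms otherwise; the
// matching reloads are the *rm variants of the same opcodes.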
|
|
|
|
|
|
|
|
static unsigned getStoreRegOpcode(unsigned SrcReg,
|
|
|
|
const TargetRegisterClass *RC,
|
|
|
|
bool isStackAligned,
|
2014-06-11 06:34:31 +08:00
|
|
|
const X86Subtarget &STI) {
|
|
|
|
return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
|
2010-06-13 04:13:29 +08:00
|
|
|
}
|
|
|
|
|
2008-01-02 05:11:32 +08:00
|
|
|
|
2010-06-13 04:13:29 +08:00
|
|
|
static unsigned getLoadRegOpcode(unsigned DestReg,
|
|
|
|
const TargetRegisterClass *RC,
|
|
|
|
bool isStackAligned,
|
2014-06-11 06:34:31 +08:00
|
|
|
const X86Subtarget &STI) {
|
|
|
|
return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
|
2008-01-02 05:11:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
|
|
|
|
MachineBasicBlock::iterator MI,
|
|
|
|
unsigned SrcReg, bool isKill, int FrameIdx,
|
2010-05-07 03:06:44 +08:00
|
|
|
const TargetRegisterClass *RC,
|
|
|
|
const TargetRegisterInfo *TRI) const {
|
2008-07-19 14:30:51 +08:00
|
|
|
const MachineFunction &MF = *MBB.getParent();
|
2010-07-27 12:16:58 +08:00
|
|
|
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
|
|
|
|
"Stack slot too small for store");
|
2013-08-18 21:08:57 +08:00
|
|
|
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
|
2015-02-03 01:38:43 +08:00
|
|
|
bool isAligned =
|
|
|
|
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
|
|
|
|
RI.canRealignStack(MF);
|
2014-06-11 06:34:31 +08:00
|
|
|
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
|
2010-01-26 08:03:12 +08:00
|
|
|
DebugLoc DL = MBB.findDebugLoc(MI);
|
2009-02-12 05:51:19 +08:00
|
|
|
addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
|
2009-05-14 05:33:08 +08:00
|
|
|
.addReg(SrcReg, getKillRegState(isKill));
|
2008-01-02 05:11:32 +08:00
|
|
|
}
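// Minimal usage sketch (hypothetical call site, not upstream code): spill a
// 32-bit physical register into frame index FI before instruction MI,
//   TII->storeRegToStackSlot(MBB, MI, X86::EAX, /*isKill=*/true, FI,
//                            &X86::GR32RegClass, TRI);
// the reload side uses loadRegFromStackSlot with the same frame index.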
|
|
|
|
|
|
|
|
void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
|
|
|
|
bool isKill,
|
|
|
|
SmallVectorImpl<MachineOperand> &Addr,
|
|
|
|
const TargetRegisterClass *RC,
|
2009-10-10 02:10:05 +08:00
|
|
|
MachineInstr::mmo_iterator MMOBegin,
|
|
|
|
MachineInstr::mmo_iterator MMOEnd,
|
2008-01-02 05:11:32 +08:00
|
|
|
SmallVectorImpl<MachineInstr*> &NewMIs) const {
|
2013-08-18 21:08:57 +08:00
|
|
|
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
|
2011-09-14 10:36:58 +08:00
|
|
|
bool isAligned = MMOBegin != MMOEnd &&
|
|
|
|
(*MMOBegin)->getAlignment() >= Alignment;
|
2014-06-11 06:34:31 +08:00
|
|
|
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
|
2010-04-03 04:16:16 +08:00
|
|
|
DebugLoc DL;
|
2009-02-13 07:08:38 +08:00
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
|
2008-01-02 05:11:32 +08:00
|
|
|
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(Addr[i]);
|
2009-05-14 05:33:08 +08:00
|
|
|
MIB.addReg(SrcReg, getKillRegState(isKill));
|
2009-10-10 02:10:05 +08:00
|
|
|
(*MIB).setMemRefs(MMOBegin, MMOEnd);
|
2008-01-02 05:11:32 +08:00
|
|
|
NewMIs.push_back(MIB);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
|
2008-07-19 14:30:51 +08:00
|
|
|
MachineBasicBlock::iterator MI,
|
|
|
|
unsigned DestReg, int FrameIdx,
|
2010-05-07 03:06:44 +08:00
|
|
|
const TargetRegisterClass *RC,
|
|
|
|
const TargetRegisterInfo *TRI) const {
|
2008-07-19 14:30:51 +08:00
|
|
|
const MachineFunction &MF = *MBB.getParent();
|
2013-08-18 21:08:57 +08:00
|
|
|
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
|
2015-02-03 01:38:43 +08:00
|
|
|
bool isAligned =
|
|
|
|
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
|
|
|
|
RI.canRealignStack(MF);
|
2014-06-11 06:34:31 +08:00
|
|
|
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
|
2010-01-26 08:03:12 +08:00
|
|
|
DebugLoc DL = MBB.findDebugLoc(MI);
|
2009-02-12 05:51:19 +08:00
|
|
|
addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
|
2008-01-02 05:11:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
|
2008-07-03 17:09:37 +08:00
|
|
|
SmallVectorImpl<MachineOperand> &Addr,
|
|
|
|
const TargetRegisterClass *RC,
|
2009-10-10 02:10:05 +08:00
|
|
|
MachineInstr::mmo_iterator MMOBegin,
|
|
|
|
MachineInstr::mmo_iterator MMOEnd,
|
2008-01-02 05:11:32 +08:00
|
|
|
SmallVectorImpl<MachineInstr*> &NewMIs) const {
|
2013-08-18 21:08:57 +08:00
|
|
|
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
|
2011-09-14 10:36:58 +08:00
|
|
|
bool isAligned = MMOBegin != MMOEnd &&
|
|
|
|
(*MMOBegin)->getAlignment() >= Alignment;
|
2014-06-11 06:34:31 +08:00
|
|
|
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
|
2010-04-03 04:16:16 +08:00
|
|
|
DebugLoc DL;
|
2009-02-13 07:08:38 +08:00
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
|
2008-01-02 05:11:32 +08:00
|
|
|
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(Addr[i]);
|
2009-10-10 02:10:05 +08:00
|
|
|
(*MIB).setMemRefs(MMOBegin, MMOEnd);
|
2008-01-02 05:11:32 +08:00
|
|
|
NewMIs.push_back(MIB);
|
|
|
|
}
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
bool X86InstrInfo::
|
|
|
|
analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
|
|
|
|
int &CmpMask, int &CmpValue) const {
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case X86::CMP64ri32:
|
|
|
|
case X86::CMP64ri8:
|
|
|
|
case X86::CMP32ri:
|
|
|
|
case X86::CMP32ri8:
|
|
|
|
case X86::CMP16ri:
|
|
|
|
case X86::CMP16ri8:
|
|
|
|
case X86::CMP8ri:
|
|
|
|
SrcReg = MI->getOperand(0).getReg();
|
|
|
|
SrcReg2 = 0;
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = MI->getOperand(1).getImm();
|
|
|
|
return true;
|
2012-08-08 08:51:41 +08:00
|
|
|
// A SUB can be used to perform a comparison.
|
|
|
|
case X86::SUB64rm:
|
|
|
|
case X86::SUB32rm:
|
|
|
|
case X86::SUB16rm:
|
|
|
|
case X86::SUB8rm:
|
|
|
|
SrcReg = MI->getOperand(1).getReg();
|
|
|
|
SrcReg2 = 0;
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = 0;
|
|
|
|
return true;
|
|
|
|
case X86::SUB64rr:
|
|
|
|
case X86::SUB32rr:
|
|
|
|
case X86::SUB16rr:
|
|
|
|
case X86::SUB8rr:
|
|
|
|
SrcReg = MI->getOperand(1).getReg();
|
|
|
|
SrcReg2 = MI->getOperand(2).getReg();
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = 0;
|
|
|
|
return true;
|
|
|
|
case X86::SUB64ri32:
|
|
|
|
case X86::SUB64ri8:
|
|
|
|
case X86::SUB32ri:
|
|
|
|
case X86::SUB32ri8:
|
|
|
|
case X86::SUB16ri:
|
|
|
|
case X86::SUB16ri8:
|
|
|
|
case X86::SUB8ri:
|
|
|
|
SrcReg = MI->getOperand(1).getReg();
|
|
|
|
SrcReg2 = 0;
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = MI->getOperand(2).getImm();
|
|
|
|
return true;
|
2012-07-07 01:36:20 +08:00
|
|
|
case X86::CMP64rr:
|
|
|
|
case X86::CMP32rr:
|
|
|
|
case X86::CMP16rr:
|
|
|
|
case X86::CMP8rr:
|
|
|
|
SrcReg = MI->getOperand(0).getReg();
|
|
|
|
SrcReg2 = MI->getOperand(1).getReg();
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = 0;
|
|
|
|
return true;
|
2012-07-19 05:40:01 +08:00
|
|
|
case X86::TEST8rr:
|
|
|
|
case X86::TEST16rr:
|
|
|
|
case X86::TEST32rr:
|
|
|
|
case X86::TEST64rr:
|
|
|
|
SrcReg = MI->getOperand(0).getReg();
|
|
|
|
if (MI->getOperand(1).getReg() != SrcReg) return false;
|
|
|
|
// Compare against zero.
|
|
|
|
SrcReg2 = 0;
|
|
|
|
CmpMask = ~0;
|
|
|
|
CmpValue = 0;
|
|
|
|
return true;
|
2012-07-07 01:36:20 +08:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
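// Example (illustrative): for "CMP32ri %reg, 42" analyzeCompare reports
// SrcReg = %reg, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42; for
// "CMP32rr %a, %b" it reports both source registers and a zero CmpValue.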
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Check whether the first instruction, whose only
|
2012-07-07 01:36:20 +08:00
|
|
|
/// purpose is to update flags, can be made redundant.
|
|
|
|
/// CMPrr can be made redundant by SUBrr if the operands are the same.
|
|
|
|
/// This function can be extended later on.
|
|
|
|
/// SrcReg, SrcReg2: register operands for FlagI.
|
|
|
|
/// ImmValue: immediate for FlagI if it takes an immediate.
|
|
|
|
inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
|
|
|
|
unsigned SrcReg2, int ImmValue,
|
|
|
|
MachineInstr *OI) {
|
|
|
|
if (((FlagI->getOpcode() == X86::CMP64rr &&
|
|
|
|
OI->getOpcode() == X86::SUB64rr) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP32rr &&
|
|
|
|
OI->getOpcode() == X86::SUB32rr)||
|
|
|
|
(FlagI->getOpcode() == X86::CMP16rr &&
|
|
|
|
OI->getOpcode() == X86::SUB16rr)||
|
|
|
|
(FlagI->getOpcode() == X86::CMP8rr &&
|
|
|
|
OI->getOpcode() == X86::SUB8rr)) &&
|
|
|
|
((OI->getOperand(1).getReg() == SrcReg &&
|
|
|
|
OI->getOperand(2).getReg() == SrcReg2) ||
|
|
|
|
(OI->getOperand(1).getReg() == SrcReg2 &&
|
|
|
|
OI->getOperand(2).getReg() == SrcReg)))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (((FlagI->getOpcode() == X86::CMP64ri32 &&
|
|
|
|
OI->getOpcode() == X86::SUB64ri32) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP64ri8 &&
|
|
|
|
OI->getOpcode() == X86::SUB64ri8) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP32ri &&
|
|
|
|
OI->getOpcode() == X86::SUB32ri) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP32ri8 &&
|
|
|
|
OI->getOpcode() == X86::SUB32ri8) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP16ri &&
|
|
|
|
OI->getOpcode() == X86::SUB16ri) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP16ri8 &&
|
|
|
|
OI->getOpcode() == X86::SUB16ri8) ||
|
|
|
|
(FlagI->getOpcode() == X86::CMP8ri &&
|
|
|
|
OI->getOpcode() == X86::SUB8ri)) &&
|
|
|
|
OI->getOperand(1).getReg() == SrcReg &&
|
|
|
|
OI->getOperand(2).getImm() == ImmValue)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
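// Example (illustrative): a "SUB32rr %a, %b" followed later by "CMP32rr %a, %b"
// sets EFLAGS the same way, so the CMP is redundant; the swapped operand order
// "CMP32rr %b, %a" is also accepted here, and optimizeCompareInstr then swaps
// the condition codes of the EFLAGS users.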
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Check whether the definition can be converted
|
2012-07-19 05:40:01 +08:00
|
|
|
/// to remove a comparison against zero.
|
|
|
|
inline static bool isDefConvertible(MachineInstr *MI) {
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: return false;
|
2013-05-22 16:13:02 +08:00
|
|
|
|
|
|
|
// The shift instructions only modify ZF if their shift count is non-zero.
|
|
|
|
// N.B.: The processor truncates the shift count depending on the encoding.
|
|
|
|
case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
|
|
|
|
case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
|
|
|
|
return getTruncatedShiftCount(MI, 2) != 0;
|
|
|
|
|
|
|
|
// Some left shift instructions can be turned into LEA instructions but only
|
|
|
|
// if their flags aren't used. Avoid transforming such instructions.
|
|
|
|
case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
|
|
|
|
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
|
|
|
|
if (isTruncatedShiftCountForLEA(ShAmt)) return false;
|
|
|
|
return ShAmt != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
|
|
|
|
case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
|
|
|
|
return getTruncatedShiftCount(MI, 3) != 0;
|
|
|
|
|
2012-07-19 05:40:01 +08:00
|
|
|
case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
|
|
|
|
case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
|
|
|
|
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
|
|
|
|
case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
|
|
|
|
case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
|
2012-12-17 12:55:07 +08:00
|
|
|
case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
|
2012-07-19 05:40:01 +08:00
|
|
|
case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
|
|
|
|
case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
|
|
|
|
case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
|
|
|
|
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
|
|
|
|
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
|
2012-12-17 12:55:07 +08:00
|
|
|
case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
|
2012-07-19 05:40:01 +08:00
|
|
|
case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
|
|
|
|
case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
|
|
|
|
case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
|
|
|
|
case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
|
|
|
|
case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
|
|
|
|
case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
|
|
|
|
case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
|
|
|
|
case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
|
|
|
|
case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
|
|
|
|
case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
|
|
|
|
case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
|
|
|
|
case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
|
|
|
|
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
|
|
|
|
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
|
|
|
|
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
|
2013-05-16 06:03:08 +08:00
|
|
|
case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
|
|
|
|
case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
|
|
|
|
case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
|
|
|
|
case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
|
|
|
|
case X86::ADC32ri: case X86::ADC32ri8:
|
|
|
|
case X86::ADC32rr: case X86::ADC64ri32:
|
|
|
|
case X86::ADC64ri8: case X86::ADC64rr:
|
|
|
|
case X86::SBB32ri: case X86::SBB32ri8:
|
|
|
|
case X86::SBB32rr: case X86::SBB64ri32:
|
|
|
|
case X86::SBB64ri8: case X86::SBB64rr:
|
2012-12-17 13:12:30 +08:00
|
|
|
case X86::ANDN32rr: case X86::ANDN32rm:
|
|
|
|
case X86::ANDN64rr: case X86::ANDN64rm:
|
2013-05-16 06:03:08 +08:00
|
|
|
case X86::BEXTR32rr: case X86::BEXTR64rr:
|
|
|
|
case X86::BEXTR32rm: case X86::BEXTR64rm:
|
|
|
|
case X86::BLSI32rr: case X86::BLSI32rm:
|
|
|
|
case X86::BLSI64rr: case X86::BLSI64rm:
|
|
|
|
case X86::BLSMSK32rr:case X86::BLSMSK32rm:
|
|
|
|
case X86::BLSMSK64rr:case X86::BLSMSK64rm:
|
|
|
|
case X86::BLSR32rr: case X86::BLSR32rm:
|
|
|
|
case X86::BLSR64rr: case X86::BLSR64rm:
|
|
|
|
case X86::BZHI32rr: case X86::BZHI32rm:
|
|
|
|
case X86::BZHI64rr: case X86::BZHI64rm:
|
|
|
|
case X86::LZCNT16rr: case X86::LZCNT16rm:
|
|
|
|
case X86::LZCNT32rr: case X86::LZCNT32rm:
|
|
|
|
case X86::LZCNT64rr: case X86::LZCNT64rm:
|
|
|
|
case X86::POPCNT16rr:case X86::POPCNT16rm:
|
|
|
|
case X86::POPCNT32rr:case X86::POPCNT32rm:
|
|
|
|
case X86::POPCNT64rr:case X86::POPCNT64rm:
|
|
|
|
case X86::TZCNT16rr: case X86::TZCNT16rm:
|
|
|
|
case X86::TZCNT32rr: case X86::TZCNT32rm:
|
|
|
|
case X86::TZCNT64rr: case X86::TZCNT64rm:
|
2012-07-19 05:40:01 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
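// Example (illustrative): "%r = AND32rr %a, %b" already sets ZF/SF/PF from its
// result, so a following compare of %r against zero can be removed once the
// AND's EFLAGS definition is marked live (see optimizeCompareInstr below).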
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Check whether the use can be converted to remove a comparison against zero.
|
2014-05-15 00:14:45 +08:00
|
|
|
static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: return X86::COND_INVALID;
|
|
|
|
case X86::LZCNT16rr: case X86::LZCNT16rm:
|
|
|
|
case X86::LZCNT32rr: case X86::LZCNT32rm:
|
|
|
|
case X86::LZCNT64rr: case X86::LZCNT64rm:
|
|
|
|
return X86::COND_B;
|
|
|
|
case X86::POPCNT16rr:case X86::POPCNT16rm:
|
|
|
|
case X86::POPCNT32rr:case X86::POPCNT32rm:
|
|
|
|
case X86::POPCNT64rr:case X86::POPCNT64rm:
|
|
|
|
return X86::COND_E;
|
|
|
|
case X86::TZCNT16rr: case X86::TZCNT16rm:
|
|
|
|
case X86::TZCNT32rr: case X86::TZCNT32rm:
|
|
|
|
case X86::TZCNT64rr: case X86::TZCNT64rm:
|
|
|
|
return X86::COND_B;
|
|
|
|
}
|
|
|
|
}
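// Example (illustrative): for "%r = POPCNT32rr %x" followed by "TEST32rr %r, %r",
// the TEST is removable because POPCNT sets ZF exactly when its result is zero,
// so an equality check on %r maps onto X86::COND_E from the POPCNT itself.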
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Check if there exists an earlier instruction that
|
2012-07-07 01:36:20 +08:00
|
|
|
/// operates on the same source operands and sets flags in the same way as
|
|
|
|
/// Compare; remove Compare if possible.
|
|
|
|
bool X86InstrInfo::
|
|
|
|
optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
|
|
|
|
int CmpMask, int CmpValue,
|
|
|
|
const MachineRegisterInfo *MRI) const {
|
2012-08-08 08:51:41 +08:00
|
|
|
// Check whether we can replace SUB with CMP.
|
|
|
|
unsigned NewOpcode = 0;
|
|
|
|
switch (CmpInstr->getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case X86::SUB64ri32:
|
|
|
|
case X86::SUB64ri8:
|
|
|
|
case X86::SUB32ri:
|
|
|
|
case X86::SUB32ri8:
|
|
|
|
case X86::SUB16ri:
|
|
|
|
case X86::SUB16ri8:
|
|
|
|
case X86::SUB8ri:
|
|
|
|
case X86::SUB64rm:
|
|
|
|
case X86::SUB32rm:
|
|
|
|
case X86::SUB16rm:
|
|
|
|
case X86::SUB8rm:
|
|
|
|
case X86::SUB64rr:
|
|
|
|
case X86::SUB32rr:
|
|
|
|
case X86::SUB16rr:
|
|
|
|
case X86::SUB8rr: {
|
|
|
|
if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
|
|
|
|
return false;
|
|
|
|
// There is no use of the destination register; we can replace SUB with CMP.
|
|
|
|
switch (CmpInstr->getOpcode()) {
|
2012-08-21 16:16:16 +08:00
|
|
|
default: llvm_unreachable("Unreachable!");
|
2012-08-08 08:51:41 +08:00
|
|
|
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
|
|
|
|
case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
|
|
|
|
case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
|
|
|
|
case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
|
|
|
|
case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
|
|
|
|
case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
|
|
|
|
case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
|
|
|
|
case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
|
|
|
|
case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
|
|
|
|
case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
|
|
|
|
case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
|
|
|
|
case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
|
|
|
|
case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
|
|
|
|
case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
|
|
|
|
case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
|
|
|
|
}
|
|
|
|
CmpInstr->setDesc(get(NewOpcode));
|
|
|
|
CmpInstr->RemoveOperand(0);
|
|
|
|
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
|
|
|
|
if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
|
|
|
|
NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
// Get the unique definition of SrcReg.
|
|
|
|
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
|
|
|
|
if (!MI) return false;
|
|
|
|
|
|
|
|
// CmpInstr is the first instruction of the BB.
|
|
|
|
MachineBasicBlock::iterator I = CmpInstr, Def = MI;
|
|
|
|
|
2012-07-19 05:40:01 +08:00
|
|
|
// If we are comparing against zero, check whether we can use MI to update
|
|
|
|
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
|
|
|
|
bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
|
2014-05-15 00:14:45 +08:00
|
|
|
if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
|
2012-07-19 05:40:01 +08:00
|
|
|
return false;
|
|
|
|
|
2014-05-15 00:14:45 +08:00
|
|
|
// If we have a use of the source register between the def and our compare
|
|
|
|
// instruction, we can eliminate the compare iff the use sets EFLAGS in the
|
|
|
|
// right way.
|
|
|
|
bool ShouldUpdateCC = false;
|
|
|
|
X86::CondCode NewCC = X86::COND_INVALID;
|
|
|
|
if (IsCmpZero && !isDefConvertible(MI)) {
|
|
|
|
// Scan forward from the def until we hit the use we're looking for or the
|
|
|
|
// compare instruction.
|
|
|
|
for (MachineBasicBlock::iterator J = MI;; ++J) {
|
|
|
|
// Do we have a convertible instruction?
|
|
|
|
NewCC = isUseDefConvertible(J);
|
|
|
|
if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
|
|
|
|
J->getOperand(1).getReg() == SrcReg) {
|
|
|
|
assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
|
|
|
|
ShouldUpdateCC = true; // Update CC later on.
|
|
|
|
// This is not a def of SrcReg, but still a def of EFLAGS. Keep going
|
|
|
|
// with the new def.
|
|
|
|
MI = Def = J;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (J == I)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
// We are searching for an earlier instruction that can make CmpInstr
|
|
|
|
// redundant and that instruction will be saved in Sub.
|
2014-04-25 13:30:21 +08:00
|
|
|
MachineInstr *Sub = nullptr;
|
2012-07-07 01:36:20 +08:00
|
|
|
const TargetRegisterInfo *TRI = &getRegisterInfo();
|
2012-07-10 02:57:12 +08:00
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
// We iterate backward, starting from the instruction before CmpInstr and
|
|
|
|
// stop when reaching the definition of a source register or when we are done
// with the BB.
|
|
|
|
// RI points to the instruction before CmpInstr.
|
|
|
|
// If the definition is in this basic block, RE points to the definition;
|
|
|
|
// otherwise, RE is the rend of the basic block.
|
|
|
|
MachineBasicBlock::reverse_iterator
|
|
|
|
RI = MachineBasicBlock::reverse_iterator(I),
|
|
|
|
RE = CmpInstr->getParent() == MI->getParent() ?
|
|
|
|
MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
|
|
|
|
CmpInstr->getParent()->rend();
|
2014-04-25 13:30:21 +08:00
|
|
|
MachineInstr *Movr0Inst = nullptr;
|
2012-07-07 01:36:20 +08:00
|
|
|
for (; RI != RE; ++RI) {
|
|
|
|
MachineInstr *Instr = &*RI;
|
|
|
|
// Check whether CmpInstr can be made redundant by the current instruction.
|
2012-07-19 05:40:01 +08:00
|
|
|
if (!IsCmpZero &&
|
|
|
|
isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
|
2012-07-07 01:36:20 +08:00
|
|
|
Sub = Instr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
|
2012-07-12 03:35:12 +08:00
|
|
|
Instr->readsRegister(X86::EFLAGS, TRI)) {
|
2012-07-07 01:36:20 +08:00
|
|
|
// This instruction modifies or uses EFLAGS.
|
2012-07-12 03:35:12 +08:00
|
|
|
|
|
|
|
// MOV32r0 etc. are implemented with xor, which clobbers the condition codes.
|
|
|
|
// They are safe to move up, if the definition to EFLAGS is dead and
|
|
|
|
// earlier instructions do not read or write EFLAGS.
|
2013-05-30 21:19:42 +08:00
|
|
|
if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
|
2012-07-12 03:35:12 +08:00
|
|
|
Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
|
|
|
|
Movr0Inst = Instr;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
// We can't remove CmpInstr.
|
|
|
|
return false;
|
2012-07-12 03:35:12 +08:00
|
|
|
}
|
2012-07-07 01:36:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Return false if no candidates exist.
|
2012-07-19 05:40:01 +08:00
|
|
|
if (!IsCmpZero && !Sub)
|
2012-07-07 01:36:20 +08:00
|
|
|
return false;
|
|
|
|
|
2012-07-07 11:34:46 +08:00
|
|
|
bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
|
|
|
|
Sub->getOperand(2).getReg() == SrcReg);
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
|
2012-07-07 11:34:46 +08:00
|
|
|
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
|
|
|
|
// If we are done with the basic block, we need to check whether EFLAGS is
|
|
|
|
// live-out.
|
|
|
|
bool IsSafe = false;
|
2012-07-07 01:36:20 +08:00
|
|
|
SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
|
|
|
|
MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
|
|
|
|
for (++I; I != E; ++I) {
|
|
|
|
const MachineInstr &Instr = *I;
|
2012-07-28 11:15:46 +08:00
|
|
|
bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
|
|
|
|
bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
|
|
|
|
// If this instruction both uses and updates EFLAGS, we need to check how the
// flags are actually used before deciding anything.
|
|
|
|
if (!UseEFLAGS && ModifyEFLAGS) {
|
2012-07-07 01:36:20 +08:00
|
|
|
// It is safe to remove CmpInstr if EFLAGS is updated again.
|
2012-07-07 11:34:46 +08:00
|
|
|
IsSafe = true;
|
2012-07-07 01:36:20 +08:00
|
|
|
break;
|
2012-07-07 11:34:46 +08:00
|
|
|
}
|
2012-07-28 11:15:46 +08:00
|
|
|
if (!UseEFLAGS && !ModifyEFLAGS)
|
2012-07-07 01:36:20 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// EFLAGS is used by this instruction.
|
2014-06-04 15:45:54 +08:00
|
|
|
X86::CondCode OldCC = X86::COND_INVALID;
|
2012-07-19 05:40:01 +08:00
|
|
|
bool OpcIsSET = false;
|
|
|
|
if (IsCmpZero || IsSwapped) {
|
|
|
|
// We decode the condition code from opcode.
|
2012-07-10 02:57:12 +08:00
|
|
|
if (Instr.isBranch())
|
|
|
|
OldCC = getCondFromBranchOpc(Instr.getOpcode());
|
|
|
|
else {
|
|
|
|
OldCC = getCondFromSETOpc(Instr.getOpcode());
|
|
|
|
if (OldCC != X86::COND_INVALID)
|
|
|
|
OpcIsSET = true;
|
|
|
|
else
|
2012-09-20 11:06:15 +08:00
|
|
|
OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
|
2012-07-10 02:57:12 +08:00
|
|
|
}
|
|
|
|
if (OldCC == X86::COND_INVALID) return false;
|
2012-07-19 05:40:01 +08:00
|
|
|
}
|
|
|
|
if (IsCmpZero) {
|
|
|
|
switch (OldCC) {
|
|
|
|
default: break;
|
|
|
|
case X86::COND_A: case X86::COND_AE:
|
|
|
|
case X86::COND_B: case X86::COND_BE:
|
|
|
|
case X86::COND_G: case X86::COND_GE:
|
|
|
|
case X86::COND_L: case X86::COND_LE:
|
|
|
|
case X86::COND_O: case X86::COND_NO:
|
|
|
|
// CF and OF are used, so we can't perform this optimization.
|
|
|
|
return false;
|
|
|
|
}
|
2014-05-15 00:14:45 +08:00
|
|
|
|
|
|
|
// If we're updating the condition code check if we have to reverse the
|
|
|
|
// condition.
|
|
|
|
if (ShouldUpdateCC)
|
|
|
|
switch (OldCC) {
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
case X86::COND_E:
|
|
|
|
break;
|
|
|
|
case X86::COND_NE:
|
|
|
|
NewCC = GetOppositeBranchCondition(NewCC);
|
|
|
|
break;
|
|
|
|
}
|
2012-07-19 05:40:01 +08:00
|
|
|
} else if (IsSwapped) {
|
|
|
|
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
|
|
|
|
// to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
|
|
|
|
// We swap the condition code and synthesize the new opcode.
|
2014-05-15 00:14:45 +08:00
|
|
|
NewCC = getSwappedCondition(OldCC);
|
2012-07-10 02:57:12 +08:00
|
|
|
if (NewCC == X86::COND_INVALID) return false;
|
2014-05-15 00:14:45 +08:00
|
|
|
}
|
2012-07-10 02:57:12 +08:00
|
|
|
|
2014-05-15 00:14:45 +08:00
|
|
|
if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
|
2012-07-10 02:57:12 +08:00
|
|
|
// Synthesize the new opcode.
|
|
|
|
bool HasMemoryOperand = Instr.hasOneMemOperand();
|
|
|
|
unsigned NewOpc;
|
|
|
|
if (Instr.isBranch())
|
|
|
|
NewOpc = GetCondBranchFromCond(NewCC);
|
|
|
|
else if(OpcIsSET)
|
|
|
|
NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
|
|
|
|
else {
|
|
|
|
unsigned DstReg = Instr.getOperand(0).getReg();
|
|
|
|
NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
|
|
|
|
HasMemoryOperand);
|
|
|
|
}
|
2012-07-07 01:36:20 +08:00
|
|
|
|
|
|
|
// Push the MachineInstr to OpsToUpdate.
|
|
|
|
// If it is safe to remove CmpInstr, the condition code of these
|
|
|
|
// instructions will be modified.
|
|
|
|
OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
|
|
|
|
}
|
2012-07-28 11:15:46 +08:00
|
|
|
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
|
|
|
|
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
|
2012-07-07 11:34:46 +08:00
|
|
|
IsSafe = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If EFLAGS is not killed nor re-defined, we should check whether it is
|
|
|
|
// live-out. If it is live-out, do not optimize.
|
2012-07-19 05:40:01 +08:00
|
|
|
if ((IsCmpZero || IsSwapped) && !IsSafe) {
|
2012-07-07 11:34:46 +08:00
|
|
|
MachineBasicBlock *MBB = CmpInstr->getParent();
|
|
|
|
for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
|
|
|
|
SE = MBB->succ_end(); SI != SE; ++SI)
|
|
|
|
if ((*SI)->isLiveIn(X86::EFLAGS))
|
|
|
|
return false;
|
2012-07-07 01:36:20 +08:00
|
|
|
}
|
|
|
|
|
2012-07-19 05:40:01 +08:00
|
|
|
// The instruction to be updated is either Sub or MI.
|
|
|
|
Sub = IsCmpZero ? MI : Sub;
|
2013-05-18 09:02:03 +08:00
|
|
|
// Move Movr0Inst to the appropriate place before Sub.
|
2012-07-12 03:35:12 +08:00
|
|
|
if (Movr0Inst) {
|
2013-05-18 09:02:03 +08:00
|
|
|
// Look backwards until we find a def that doesn't use the current EFLAGS.
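// Note: MOV32r0 is itself expanded to an EFLAGS-clobbering xor (see
// expandPostRAPseudo in this file), so it is re-inserted immediately before
// an instruction that redefines EFLAGS without reading it; that way the
// xor's flag clobber cannot disturb any live flags.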
|
|
|
|
Def = Sub;
|
|
|
|
MachineBasicBlock::reverse_iterator
|
|
|
|
InsertI = MachineBasicBlock::reverse_iterator(++Def),
|
|
|
|
InsertE = Sub->getParent()->rend();
|
|
|
|
for (; InsertI != InsertE; ++InsertI) {
|
|
|
|
MachineInstr *Instr = &*InsertI;
|
|
|
|
if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
|
|
|
|
Instr->modifiesRegister(X86::EFLAGS, TRI)) {
|
|
|
|
Sub->getParent()->remove(Movr0Inst);
|
|
|
|
Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
|
|
|
|
Movr0Inst);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (InsertI == InsertE)
|
|
|
|
return false;
|
2012-07-12 03:35:12 +08:00
|
|
|
}
|
|
|
|
|
2012-09-18 06:04:23 +08:00
|
|
|
// Make sure the Sub instruction defines EFLAGS and mark the def live.
|
2013-05-16 06:03:08 +08:00
|
|
|
unsigned i = 0, e = Sub->getNumOperands();
|
|
|
|
for (; i != e; ++i) {
|
|
|
|
MachineOperand &MO = Sub->getOperand(i);
|
|
|
|
if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
|
|
|
|
MO.setIsDead(false);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(i != e && "Unable to locate a def EFLAGS operand");
|
|
|
|
|
2012-07-07 01:36:20 +08:00
|
|
|
CmpInstr->eraseFromParent();
|
|
|
|
|
|
|
|
// Modify the condition code of instructions in OpsToUpdate.
|
|
|
|
for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
|
|
|
|
OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Try to remove the load by folding it to a register
|
2012-08-02 08:56:42 +08:00
|
|
|
/// operand at the use. We fold the load instruction if the load defines a virtual
|
|
|
|
/// register, the virtual register is used once in the same BB, and the
|
|
|
|
/// instructions in-between do not load or store, and have no side effects.
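/// A minimal sketch of the transformation (virtual register numbers are
/// illustrative only):
///   %vreg1 = MOV32rm <mem>            ; single use, same BB
///   %vreg2 = ADD32rr %vreg0, %vreg1
/// becomes
///   %vreg2 = ADD32rm %vreg0, <mem>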
|
2014-10-21 06:14:22 +08:00
|
|
|
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
|
|
|
|
const MachineRegisterInfo *MRI,
|
|
|
|
unsigned &FoldAsLoadDefReg,
|
|
|
|
MachineInstr *&DefMI) const {
|
2012-08-02 08:56:42 +08:00
|
|
|
if (FoldAsLoadDefReg == 0)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-08-02 08:56:42 +08:00
|
|
|
// To be conservative, if there exists another load, clear the load candidate.
|
|
|
|
if (MI->mayLoad()) {
|
|
|
|
FoldAsLoadDefReg = 0;
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-08-02 08:56:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check whether we can move DefMI here.
|
|
|
|
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
|
|
|
|
assert(DefMI);
|
|
|
|
bool SawStore = false;
|
2015-05-20 05:22:20 +08:00
|
|
|
if (!DefMI->isSafeToMove(nullptr, SawStore))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-08-02 08:56:42 +08:00
|
|
|
|
2014-10-21 06:14:22 +08:00
|
|
|
// Collect information about virtual register operands of MI.
|
|
|
|
unsigned SrcOperandId = 0;
|
|
|
|
bool FoundSrcOperand = false;
|
|
|
|
for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i);
|
|
|
|
if (!MO.isReg())
|
|
|
|
continue;
|
|
|
|
unsigned Reg = MO.getReg();
|
|
|
|
if (Reg != FoldAsLoadDefReg)
|
|
|
|
continue;
|
|
|
|
// Do not fold if we have a subreg use or a def or multiple uses.
|
|
|
|
if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
|
2014-10-13 12:17:34 +08:00
|
|
|
return nullptr;
|
2014-10-12 18:52:55 +08:00
|
|
|
|
2014-10-21 06:14:22 +08:00
|
|
|
SrcOperandId = i;
|
|
|
|
FoundSrcOperand = true;
|
2014-10-13 12:17:34 +08:00
|
|
|
}
|
2014-10-21 06:14:22 +08:00
|
|
|
if (!FoundSrcOperand)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// Check whether we can fold the def into SrcOperandId.
|
2015-02-28 20:04:00 +08:00
|
|
|
MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI);
|
2014-10-21 06:14:22 +08:00
|
|
|
if (FoldMI) {
|
|
|
|
FoldAsLoadDefReg = 0;
|
|
|
|
return FoldMI;
|
|
|
|
}
|
|
|
|
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-08-02 08:56:42 +08:00
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Expand a single-def pseudo instruction to a two-addr
|
|
|
|
/// instruction with two undef reads of the register being defined.
|
|
|
|
/// This is used for mapping:
|
2011-09-29 13:10:54 +08:00
|
|
|
/// %xmm4 = V_SET0
|
|
|
|
/// to:
|
|
|
|
/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
|
|
|
|
///
|
2012-12-20 05:31:56 +08:00
|
|
|
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
|
|
|
|
const MCInstrDesc &Desc) {
|
2011-09-29 13:10:54 +08:00
|
|
|
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
|
2012-12-20 05:31:56 +08:00
|
|
|
unsigned Reg = MIB->getOperand(0).getReg();
|
|
|
|
MIB->setDesc(Desc);
|
2011-09-29 13:10:54 +08:00
|
|
|
|
|
|
|
// MachineInstr::addOperand() will insert explicit operands before any
|
|
|
|
// implicit operands.
|
2012-12-20 05:31:56 +08:00
|
|
|
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
|
2011-09-29 13:10:54 +08:00
|
|
|
// But we don't trust that.
|
2012-12-20 05:31:56 +08:00
|
|
|
assert(MIB->getOperand(1).getReg() == Reg &&
|
|
|
|
MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
|
2011-09-29 13:10:54 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-07-26 03:31:34 +08:00
|
|
|
// LoadStackGuard has so far only been implemented for 64-bit MachO. A different
|
|
|
|
// code sequence is needed for other targets.
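// On 64-bit MachO the expansion is roughly (illustrative; the guard symbol
// comes from the instruction's memory operand):
//   movq _guard@GOTPCREL(%rip), %reg   // load the guard's address from the GOT
//   movq (%reg), %reg                  // load the guard value itself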
|
|
|
|
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
|
|
|
|
const TargetInstrInfo &TII) {
|
|
|
|
MachineBasicBlock &MBB = *MIB->getParent();
|
|
|
|
DebugLoc DL = MIB->getDebugLoc();
|
|
|
|
unsigned Reg = MIB->getOperand(0).getReg();
|
|
|
|
const GlobalValue *GV =
|
|
|
|
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
|
|
|
|
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
|
|
|
|
MachineMemOperand *MMO = MBB.getParent()->
|
|
|
|
getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
|
2014-11-01 07:19:46 +08:00
|
|
|
MachineBasicBlock::iterator I = MIB.getInstr();
|
2014-07-26 03:31:34 +08:00
|
|
|
|
|
|
|
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
|
|
|
|
.addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
|
|
|
|
.addMemOperand(MMO);
|
|
|
|
MIB->setDebugLoc(DL);
|
|
|
|
MIB->setDesc(TII.get(X86::MOV64rm));
|
|
|
|
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
|
|
|
|
}
|
|
|
|
|
2011-09-29 13:10:54 +08:00
|
|
|
bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
|
2014-06-11 06:34:31 +08:00
|
|
|
bool HasAVX = Subtarget.hasAVX();
|
2012-12-20 05:31:56 +08:00
|
|
|
MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
|
2011-09-29 13:10:54 +08:00
|
|
|
switch (MI->getOpcode()) {
|
2013-12-31 11:05:38 +08:00
|
|
|
case X86::MOV32r0:
|
|
|
|
return Expand2AddrUndef(MIB, get(X86::XOR32rr));
|
2012-10-05 14:05:15 +08:00
|
|
|
case X86::SETB_C8r:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
|
2012-10-05 14:05:15 +08:00
|
|
|
case X86::SETB_C16r:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::SBB16rr));
|
2012-10-05 14:05:15 +08:00
|
|
|
case X86::SETB_C32r:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
|
2012-10-05 14:05:15 +08:00
|
|
|
case X86::SETB_C64r:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::SBB64rr));
|
2011-09-29 13:10:54 +08:00
|
|
|
case X86::V_SET0:
|
2011-11-30 06:27:25 +08:00
|
|
|
case X86::FsFLD0SS:
|
|
|
|
case X86::FsFLD0SD:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
|
2012-08-28 15:05:28 +08:00
|
|
|
case X86::AVX_SET0:
|
|
|
|
assert(HasAVX && "AVX not supported");
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
|
2013-08-25 20:54:30 +08:00
|
|
|
case X86::AVX512_512_SET0:
|
|
|
|
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
|
2012-08-28 15:30:47 +08:00
|
|
|
case X86::V_SETALLONES:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
|
2012-08-28 15:30:47 +08:00
|
|
|
case X86::AVX2_SETALLONES:
|
2012-12-20 05:31:56 +08:00
|
|
|
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
|
2011-10-09 02:28:28 +08:00
|
|
|
case X86::TEST8ri_NOREX:
|
|
|
|
MI->setDesc(get(X86::TEST8ri));
|
|
|
|
return true;
|
2014-12-04 13:20:33 +08:00
|
|
|
case X86::KSET0B:
|
2013-08-25 20:54:30 +08:00
|
|
|
case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
|
|
|
|
case X86::KSET1B:
|
|
|
|
case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
|
2014-07-26 03:31:34 +08:00
|
|
|
case TargetOpcode::LOAD_STACK_GUARD:
|
|
|
|
expandLoadStackGuard(MIB, *this);
|
|
|
|
return true;
|
2011-09-29 13:10:54 +08:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2008-07-08 07:14:23 +08:00
|
|
|
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
|
2015-02-28 20:04:00 +08:00
|
|
|
ArrayRef<MachineOperand> MOs,
|
2009-02-03 08:55:04 +08:00
|
|
|
MachineInstr *MI,
|
|
|
|
const TargetInstrInfo &TII) {
|
2008-01-07 09:35:02 +08:00
|
|
|
// Create the base instruction with the memory operand as the first part.
|
2012-12-20 05:31:56 +08:00
|
|
|
// Omit the implicit operands, something BuildMI can't do.
|
2009-02-03 08:55:04 +08:00
|
|
|
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
|
|
|
|
MI->getDebugLoc(), true);
|
2012-12-20 05:31:56 +08:00
|
|
|
MachineInstrBuilder MIB(MF, NewMI);
|
2008-01-07 09:35:02 +08:00
|
|
|
unsigned NumAddrOps = MOs.size();
|
|
|
|
for (unsigned i = 0; i != NumAddrOps; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MOs[i]);
|
2008-01-07 09:35:02 +08:00
|
|
|
if (NumAddrOps < 4) // FrameIndex only
|
2009-04-09 05:14:34 +08:00
|
|
|
addOffset(MIB, 0);
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
// Loop over the rest of the ri operands, converting them over.
|
2008-01-07 15:27:27 +08:00
|
|
|
unsigned NumOps = MI->getDesc().getNumOperands()-2;
|
2008-01-07 09:35:02 +08:00
|
|
|
for (unsigned i = 0; i != NumOps; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i+2);
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MO);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i);
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MO);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
return MIB;
|
|
|
|
}
|
|
|
|
|
2015-02-28 20:04:00 +08:00
|
|
|
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
|
|
|
|
unsigned OpNo, ArrayRef<MachineOperand> MOs,
|
2008-01-07 09:35:02 +08:00
|
|
|
MachineInstr *MI, const TargetInstrInfo &TII) {
|
2012-12-20 05:31:56 +08:00
|
|
|
// Omit the implicit operands, something BuildMI can't do.
|
2009-02-03 08:55:04 +08:00
|
|
|
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
|
|
|
|
MI->getDebugLoc(), true);
|
2012-12-20 05:31:56 +08:00
|
|
|
MachineInstrBuilder MIB(MF, NewMI);
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &MO = MI->getOperand(i);
|
|
|
|
if (i == OpNo) {
|
2008-10-03 23:45:36 +08:00
|
|
|
assert(MO.isReg() && "Expected to fold into reg operand!");
|
2008-01-07 09:35:02 +08:00
|
|
|
unsigned NumAddrOps = MOs.size();
|
|
|
|
for (unsigned i = 0; i != NumAddrOps; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MOs[i]);
|
2008-01-07 09:35:02 +08:00
|
|
|
if (NumAddrOps < 4) // FrameIndex only
|
2009-04-09 05:14:34 +08:00
|
|
|
addOffset(MIB, 0);
|
2008-01-07 09:35:02 +08:00
|
|
|
} else {
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MO);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return MIB;
|
|
|
|
}
|
|
|
|
|
|
|
|
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
|
2015-02-28 20:04:00 +08:00
|
|
|
ArrayRef<MachineOperand> MOs,
|
2008-01-07 09:35:02 +08:00
|
|
|
MachineInstr *MI) {
|
2008-07-08 07:14:23 +08:00
|
|
|
MachineFunction &MF = *MI->getParent()->getParent();
|
2009-02-12 05:51:19 +08:00
|
|
|
MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
|
2008-01-07 09:35:02 +08:00
|
|
|
|
|
|
|
unsigned NumAddrOps = MOs.size();
|
|
|
|
for (unsigned i = 0; i != NumAddrOps; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(MOs[i]);
|
2008-01-07 09:35:02 +08:00
|
|
|
if (NumAddrOps < 4) // FrameIndex only
|
2009-04-09 05:14:34 +08:00
|
|
|
addOffset(MIB, 0);
|
2008-01-07 09:35:02 +08:00
|
|
|
return MIB.addImm(0);
|
|
|
|
}
|
|
|
|
|
2015-02-28 20:04:00 +08:00
|
|
|
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
|
|
|
MachineInstr *MI,
|
|
|
|
unsigned OpNum,
|
|
|
|
ArrayRef<MachineOperand> MOs,
|
|
|
|
unsigned Size, unsigned Align,
|
|
|
|
bool AllowCommute) const {
|
2014-04-25 13:30:21 +08:00
|
|
|
const DenseMap<unsigned,
|
|
|
|
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
|
2014-06-11 06:34:31 +08:00
|
|
|
bool isCallRegIndirect = Subtarget.callRegIndirect();
|
2008-01-07 09:35:02 +08:00
|
|
|
bool isTwoAddrFold = false;
|
2013-03-28 07:16:18 +08:00
|
|
|
|
2015-02-10 00:04:52 +08:00
|
|
|
// For CPUs that favor the register form of a call,
|
|
|
|
// do not fold loads into calls.
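// E.g. (illustrative) keep
//   movq 8(%rsp), %rax ; callq *%rax
// rather than folding the load into the call as
//   callq *8(%rsp)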
|
2013-03-28 07:16:18 +08:00
|
|
|
if (isCallRegIndirect &&
|
2015-02-10 00:04:52 +08:00
|
|
|
(MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2013-03-28 07:16:18 +08:00
|
|
|
|
2008-01-07 15:27:27 +08:00
|
|
|
unsigned NumOps = MI->getDesc().getNumOperands();
|
2008-01-07 09:35:02 +08:00
|
|
|
bool isTwoAddr = NumOps > 1 &&
|
2011-06-29 03:10:37 +08:00
|
|
|
MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2011-05-01 07:00:05 +08:00
|
|
|
// FIXME: AsmPrinter doesn't know how to handle
|
|
|
|
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
|
|
|
|
if (MI->getOpcode() == X86::ADD32ri &&
|
|
|
|
MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2011-05-01 07:00:05 +08:00
|
|
|
|
2014-04-25 13:30:21 +08:00
|
|
|
MachineInstr *NewMI = nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
// Folding a memory location into the two-address part of a two-address
|
|
|
|
// instruction is different from folding it in other places. It requires
|
|
|
|
// replacing the *two* registers with the memory location.
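// For example (illustrative): folding a spill slot into the tied def/use
// pair of
//   %reg = ADD32rr %reg<tied>, %other
// yields the read-modify-write form
//   ADD32mr <fi#N>, %other
// where the memory location is both a source and the destination.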
|
2015-02-10 00:30:58 +08:00
|
|
|
if (isTwoAddr && NumOps >= 2 && OpNum < 2 &&
|
2008-10-03 23:45:36 +08:00
|
|
|
MI->getOperand(0).isReg() &&
|
|
|
|
MI->getOperand(1).isReg() &&
|
2011-01-26 10:03:37 +08:00
|
|
|
MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
|
|
|
|
isTwoAddrFold = true;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 0) {
|
2013-05-30 21:19:42 +08:00
|
|
|
if (MI->getOpcode() == X86::MOV32r0) {
|
|
|
|
NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
|
|
|
|
if (NewMI)
|
|
|
|
return NewMI;
|
2012-08-23 12:57:36 +08:00
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable0;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 1) {
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable1;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 2) {
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable2;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 3) {
|
2012-08-01 20:06:00 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable3;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 4) {
|
2014-12-18 20:28:22 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable4;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
// If a table was selected...
|
|
|
|
if (OpcodeTablePtr) {
|
|
|
|
// Find the Opcode to fuse
|
2010-10-08 07:08:41 +08:00
|
|
|
DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
|
|
|
|
OpcodeTablePtr->find(MI->getOpcode());
|
2008-01-07 09:35:02 +08:00
|
|
|
if (I != OpcodeTablePtr->end()) {
|
2009-09-11 08:39:26 +08:00
|
|
|
unsigned Opcode = I->second.first;
|
* Combines Alignment, AuxInfo, and TB_NOT_REVERSABLE flag into a
single field (Flags), which is a bitwise OR of items from the TB_*
enum. This makes it easier to add new information in the future.
* Gives every static array an equivalent layout: { RegOp, MemOp, Flags }
* Adds a helper function, AddTableEntry, to avoid duplication of the
insertion code.
* Renames TB_NOT_REVERSABLE to TB_NO_REVERSE.
* Adds TB_NO_FORWARD, which is analogous to TB_NO_REVERSE, except that
it prevents addition of the Reg->Mem entry. (This is going to be used
by Native Client, in the next CL).
Patch by David Meyer
llvm-svn: 139311
2011-09-09 02:35:57 +08:00
|
|
|
unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
|
2009-07-15 14:10:07 +08:00
|
|
|
if (Align < MinAlign)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-09-11 09:01:31 +08:00
|
|
|
bool NarrowToMOV32rm = false;
|
2009-09-11 08:39:26 +08:00
|
|
|
if (Size) {
|
2015-02-10 00:30:58 +08:00
|
|
|
unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
|
2009-09-11 08:39:26 +08:00
|
|
|
if (Size < RCSize) {
|
|
|
|
// Check if it's safe to fold the load. If the size of the object is
|
|
|
|
// narrower than the load width, then it's not.
|
|
|
|
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-09-11 08:39:26 +08:00
|
|
|
// If this is a 64-bit load, but the spill slot is only 32 bits, then we can do
|
2014-10-21 06:14:22 +08:00
|
|
|
// a 32-bit load which is implicitly zero-extended. This likely is
|
|
|
|
// due to live interval analysis remat'ing a load from a stack slot.
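// For example (illustrative): a reload of a 4-byte slot such as
//   %reg64 = MOV64rm <fi#N>
// is rewritten as
//   %reg32 = MOV32rm <fi#N>
// since the 32-bit load implicitly zeroes the upper half of the register.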
|
2009-09-11 09:01:31 +08:00
|
|
|
if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-09-11 08:39:26 +08:00
|
|
|
Opcode = X86::MOV32rm;
|
2009-09-11 09:01:31 +08:00
|
|
|
NarrowToMOV32rm = true;
|
2009-09-11 08:39:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
if (isTwoAddrFold)
|
2009-09-11 08:39:26 +08:00
|
|
|
NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
|
2008-01-07 09:35:02 +08:00
|
|
|
else
|
2015-02-10 00:30:58 +08:00
|
|
|
NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this);
|
2009-09-11 09:01:31 +08:00
|
|
|
|
|
|
|
if (NarrowToMOV32rm) {
|
|
|
|
// This is the special case where we use a MOV32rm to load a 32-bit
|
|
|
|
// value and zero-extend the top bits. Change the destination register
|
|
|
|
// to a 32-bit one.
|
|
|
|
unsigned DstReg = NewMI->getOperand(0).getReg();
|
|
|
|
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
|
2014-10-21 06:14:22 +08:00
|
|
|
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
|
2009-09-11 09:01:31 +08:00
|
|
|
else
|
2010-05-24 22:48:17 +08:00
|
|
|
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
|
2009-09-11 09:01:31 +08:00
|
|
|
}
|
2008-01-07 09:35:02 +08:00
|
|
|
return NewMI;
|
|
|
|
}
|
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2014-10-21 06:14:22 +08:00
|
|
|
// If the instruction and target operand are commutable, commute the
|
|
|
|
// instruction and try again.
|
|
|
|
if (AllowCommute) {
|
2015-02-10 00:30:58 +08:00
|
|
|
unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
|
2014-10-21 06:14:22 +08:00
|
|
|
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
|
|
|
|
bool HasDef = MI->getDesc().getNumDefs();
|
|
|
|
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
|
|
|
|
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
|
|
|
|
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
|
|
|
|
bool Tied0 =
|
|
|
|
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
|
|
|
|
bool Tied1 =
|
|
|
|
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
|
|
|
|
|
|
|
|
// If either of the commutable operands is tied to the destination,
|
|
|
|
// then we cannot commute and fold.
|
|
|
|
if ((HasDef && Reg0 == Reg1 && Tied0) ||
|
|
|
|
(HasDef && Reg0 == Reg2 && Tied1))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
if ((CommuteOpIdx1 == OriginalOpIdx) ||
|
|
|
|
(CommuteOpIdx2 == OriginalOpIdx)) {
|
|
|
|
MachineInstr *CommutedMI = commuteInstruction(MI, false);
|
|
|
|
if (!CommutedMI) {
|
|
|
|
// Unable to commute.
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
if (CommutedMI != MI) {
|
|
|
|
// New instruction. We can't fold from this.
|
|
|
|
CommutedMI->eraseFromParent();
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Attempt to fold with the commuted version of the instruction.
|
|
|
|
unsigned CommuteOp =
|
|
|
|
(CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
|
|
|
|
NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
|
|
|
|
/*AllowCommute=*/false);
|
|
|
|
if (NewMI)
|
|
|
|
return NewMI;
|
|
|
|
|
|
|
|
// Folding failed again - undo the commute before returning.
|
|
|
|
MachineInstr *UncommutedMI = commuteInstruction(MI, false);
|
|
|
|
if (!UncommutedMI) {
|
|
|
|
// Unable to commute.
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
if (UncommutedMI != MI) {
|
|
|
|
// New instruction. It doesn't need to be kept.
|
|
|
|
UncommutedMI->eraseFromParent();
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return here to prevent a duplicate fuse-failure report.
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-01-26 10:03:37 +08:00
|
|
|
// No fusion
|
2010-07-10 04:43:09 +08:00
|
|
|
if (PrintFailedFusing && !MI->isCopy())
|
2015-02-10 00:30:58 +08:00
|
|
|
dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Return true for all instructions that only update
|
2011-09-16 05:42:23 +08:00
|
|
|
/// the first 32 or 64 bits of the destination register and leave the rest
|
|
|
|
/// unmodified. This can be used to avoid folding loads if the instructions
|
|
|
|
/// only update part of the destination register, and the non-updated part is
|
|
|
|
/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
|
|
|
|
/// instructions breaks the partial register dependency and it can improve
|
|
|
|
/// performance. e.g.:
|
|
|
|
///
|
|
|
|
/// movss (%rdi), %xmm0
|
|
|
|
/// cvtss2sd %xmm0, %xmm0
|
|
|
|
///
|
|
|
|
/// Instead of
|
|
|
|
/// cvtss2sd (%rdi), %xmm0
|
|
|
|
///
|
2011-09-16 07:04:24 +08:00
|
|
|
/// FIXME: This should be turned into a TSFlags.
|
|
|
|
///
|
2011-09-16 05:42:23 +08:00
|
|
|
static bool hasPartialRegUpdate(unsigned Opcode) {
|
|
|
|
switch (Opcode) {
|
2011-11-15 09:15:30 +08:00
|
|
|
case X86::CVTSI2SSrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSI2SSrm:
|
2011-11-15 09:15:30 +08:00
|
|
|
case X86::CVTSI2SS64rr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSI2SS64rm:
|
2011-11-15 09:15:30 +08:00
|
|
|
case X86::CVTSI2SDrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSI2SDrm:
|
2011-11-15 09:15:30 +08:00
|
|
|
case X86::CVTSI2SD64rr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSI2SD64rm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::CVTSD2SSrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSD2SSrm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::Int_CVTSD2SSrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::Int_CVTSD2SSrm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::CVTSS2SDrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::CVTSS2SDrm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::Int_CVTSS2SDrr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::Int_CVTSS2SDrm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::RCPSSr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::RCPSSm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::RCPSSr_Int:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::RCPSSm_Int:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::ROUNDSDr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::ROUNDSDm:
|
2011-12-09 23:43:55 +08:00
|
|
|
case X86::ROUNDSDr_Int:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::ROUNDSSr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::ROUNDSSm:
|
2011-12-09 23:43:55 +08:00
|
|
|
case X86::ROUNDSSr_Int:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::RSQRTSSr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::RSQRTSSm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::RSQRTSSr_Int:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::RSQRTSSm_Int:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::SQRTSSr:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::SQRTSSm:
|
2011-09-16 05:42:23 +08:00
|
|
|
case X86::SQRTSSr_Int:
|
2014-12-15 21:18:21 +08:00
|
|
|
case X86::SQRTSSm_Int:
|
|
|
|
case X86::SQRTSDr:
|
|
|
|
case X86::SQRTSDm:
|
|
|
|
case X86::SQRTSDr_Int:
|
|
|
|
case X86::SQRTSDm_Int:
|
2011-09-16 05:42:23 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2015-02-18 05:55:20 +08:00
|
|
|
/// Inform the ExeDepsFix pass how many idle
|
2011-11-15 09:15:30 +08:00
|
|
|
/// instructions we would like before a partial register update.
|
|
|
|
unsigned X86InstrInfo::
|
|
|
|
getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
|
|
|
|
const TargetRegisterInfo *TRI) const {
|
|
|
|
if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
// If MI is marked as reading Reg, the partial register update is wanted.
|
|
|
|
const MachineOperand &MO = MI->getOperand(0);
|
|
|
|
unsigned Reg = MO.getReg();
|
|
|
|
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
|
|
|
|
if (MO.readsReg() || MI->readsVirtualRegister(Reg))
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
if (MI->readsRegister(Reg, TRI))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If any of the preceding 16 instructions are reading Reg, insert a
|
|
|
|
// dependency breaking instruction. The magic number is based on a few
|
|
|
|
// Nehalem experiments.
|
|
|
|
return 16;
|
|
|
|
}
|
|
|
|
|
2013-10-15 06:19:03 +08:00
|
|
|
// Return true for any instruction that copies the high bits of the first source
|
|
|
|
// operand into the unused high bits of the destination operand.
|
|
|
|
static bool hasUndefRegUpdate(unsigned Opcode) {
|
|
|
|
switch (Opcode) {
|
|
|
|
case X86::VCVTSI2SSrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSI2SSrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSI2SSrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSI2SSrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSI2SS64rr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSI2SS64rm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSI2SS64rr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSI2SS64rm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSI2SDrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSI2SDrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSI2SDrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSI2SDrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSI2SD64rr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSI2SD64rm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSI2SD64rr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSI2SD64rm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSD2SSrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSD2SSrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSD2SSrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSD2SSrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSS2SDrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSS2SDrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::Int_VCVTSS2SDrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::Int_VCVTSS2SDrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VRCPSSr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VRCPSSm:
|
|
|
|
case X86::VRCPSSm_Int:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VROUNDSDr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VROUNDSDm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VROUNDSDr_Int:
|
|
|
|
case X86::VROUNDSSr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VROUNDSSm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VROUNDSSr_Int:
|
|
|
|
case X86::VRSQRTSSr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VRSQRTSSm:
|
|
|
|
case X86::VRSQRTSSm_Int:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VSQRTSSr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VSQRTSSm:
|
|
|
|
case X86::VSQRTSSm_Int:
|
|
|
|
case X86::VSQRTSDr:
|
|
|
|
case X86::VSQRTSDm:
|
|
|
|
case X86::VSQRTSDm_Int:
|
|
|
|
// AVX-512
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSD2SSZrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSD2SSZrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
case X86::VCVTSS2SDZrr:
|
2014-12-28 21:15:05 +08:00
|
|
|
case X86::VCVTSS2SDZrm:
|
2013-10-15 06:19:03 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Inform the ExeDepsFix pass how many idle instructions we would like before
|
|
|
|
/// certain undef register reads.
|
|
|
|
///
|
|
|
|
/// This catches the VCVTSI2SD family of instructions:
|
|
|
|
///
|
|
|
|
/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
|
|
|
|
///
|
|
|
|
/// We should be careful *not* to catch VXOR idioms, which are presumably
|
|
|
|
/// handled specially in the pipeline:
|
|
|
|
///
|
|
|
|
/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
|
|
|
|
///
|
|
|
|
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
|
|
|
|
/// high bits that are passed-through are not live.
|
|
|
|
unsigned X86InstrInfo::
|
|
|
|
getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
|
|
|
|
const TargetRegisterInfo *TRI) const {
|
|
|
|
if (!hasUndefRegUpdate(MI->getOpcode()))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
// Set the OpNum parameter to the first source operand.
|
|
|
|
OpNum = 1;
|
|
|
|
|
|
|
|
const MachineOperand &MO = MI->getOperand(OpNum);
|
|
|
|
if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
|
|
|
|
// Use the same magic number as getPartialRegUpdateClearance.
|
|
|
|
return 16;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-11-15 09:15:30 +08:00
|
|
|
void X86InstrInfo::
|
|
|
|
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
|
|
|
|
const TargetRegisterInfo *TRI) const {
|
|
|
|
unsigned Reg = MI->getOperand(OpNum).getReg();
|
2013-10-15 06:19:03 +08:00
|
|
|
// If MI kills this register, the false dependence is already broken.
|
|
|
|
if (MI->killsRegister(Reg, TRI))
|
|
|
|
return;
|
2011-11-15 09:15:30 +08:00
|
|
|
if (X86::VR128RegClass.contains(Reg)) {
|
|
|
|
// These instructions are all floating point domain, so xorps is the best
|
|
|
|
// choice.
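// For example (illustrative): before
//   sqrtss %xmm1, %xmm6
// we emit
//   xorps %xmm6, %xmm6
// so the sqrtss no longer depends on the previous value of %xmm6.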
|
2014-06-11 06:34:31 +08:00
|
|
|
bool HasAVX = Subtarget.hasAVX();
|
2011-11-15 09:15:30 +08:00
|
|
|
unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
|
|
|
|
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
|
|
|
|
.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
|
|
|
|
} else if (X86::VR256RegClass.contains(Reg)) {
|
|
|
|
// Use vxorps to clear the full ymm register.
|
|
|
|
// It wants to read and write the xmm sub-register.
|
|
|
|
unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
|
|
|
|
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
|
|
|
|
.addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
|
|
|
|
.addReg(Reg, RegState::ImplicitDefine);
|
|
|
|
} else
|
|
|
|
return;
|
|
|
|
MI->addRegisterKilled(Reg, TRI, true);
|
|
|
|
}
|
|
|
|
|
2015-02-28 20:04:00 +08:00
|
|
|
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
|
|
|
MachineInstr *MI,
|
|
|
|
ArrayRef<unsigned> Ops,
|
|
|
|
int FrameIndex) const {
|
2011-01-26 10:03:37 +08:00
|
|
|
// Check the -disable-spill-fusing flag.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (NoFusing) return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2011-09-16 05:42:23 +08:00
|
|
|
// Unless optimizing for size, don't fold to avoid partial
|
|
|
|
// register update stalls
|
2015-02-14 09:59:52 +08:00
|
|
|
if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
|
2011-09-16 05:42:23 +08:00
|
|
|
hasPartialRegUpdate(MI->getOpcode()))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-12-18 15:40:29 +08:00
|
|
|
|
2008-02-09 05:20:40 +08:00
|
|
|
const MachineFrameInfo *MFI = MF.getFrameInfo();
|
2009-09-11 08:39:26 +08:00
|
|
|
unsigned Size = MFI->getObjectSize(FrameIndex);
|
2008-02-09 05:20:40 +08:00
|
|
|
unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
|
2013-10-06 21:48:22 +08:00
|
|
|
// If the function stack isn't realigned, we don't want to fold instructions
|
|
|
|
// that need increased alignment.
|
|
|
|
if (!RI.needsStackRealignment(MF))
|
2015-02-03 01:38:43 +08:00
|
|
|
Alignment =
|
|
|
|
std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
|
2008-01-07 09:35:02 +08:00
|
|
|
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
|
|
|
|
unsigned NewOpc = 0;
|
2009-09-11 08:39:26 +08:00
|
|
|
unsigned RCSize = 0;
|
2008-01-07 09:35:02 +08:00
|
|
|
switch (MI->getOpcode()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
default: return nullptr;
|
2009-09-11 08:39:26 +08:00
|
|
|
case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
|
2010-05-19 05:42:03 +08:00
|
|
|
case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
|
|
|
|
case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
|
|
|
|
case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
2009-09-11 08:39:26 +08:00
|
|
|
// Check if it's safe to fold the load. If the size of the object is
|
|
|
|
// narrower than the load width, then it's not.
|
|
|
|
if (Size < RCSize)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
// Change to CMPXXri r, 0 first.
|
2008-01-12 02:10:50 +08:00
|
|
|
MI->setDesc(get(NewOpc));
|
2008-01-07 09:35:02 +08:00
|
|
|
MI->getOperand(1).ChangeToImmediate(0);
|
|
|
|
} else if (Ops.size() != 1)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2015-02-28 20:04:00 +08:00
|
|
|
return foldMemoryOperandImpl(MF, MI, Ops[0],
|
|
|
|
MachineOperand::CreateFI(FrameIndex), Size,
|
|
|
|
Alignment, /*AllowCommute=*/true);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
|
[X86] Fix a bug in X86's peephole optimization.
Peephole optimization was folding MOVSDrm, which is a zero-extending double
precision floating point load, into ADDPDrr, which is a SIMD add of two packed
double precision floating point values.
(before)
%vreg21<def> = MOVSDrm <fi#0>, 1, %noreg, 0, %noreg; mem:LD8[%7](align=16)(tbaa=<badref>) VR128:%vreg21
%vreg23<def,tied1> = ADDPDrr %vreg20<tied0>, %vreg21; VR128:%vreg23,%vreg20,%vreg21
(after)
%vreg23<def,tied1> = ADDPDrm %vreg20<tied0>, <fi#0>, 1, %noreg, 0, %noreg; mem:LD8[%7](align=16)(tbaa=<badref>) VR128:%vreg23,%vreg20
X86InstrInfo::foldMemoryOperandImpl already had the logic that prevented this
from happening. However the check wasn't being conducted for loads from stack
objects. This commit factors out the logic into a new function and uses it for
checking loads from stack slots are not zero-extending loads.
rdar://problem/18236850
llvm-svn: 217799
2014-09-16 02:23:52 +08:00
|
|
|
static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
|
|
|
|
const MachineFunction &MF) {
|
|
|
|
unsigned Opc = LoadMI.getOpcode();
|
|
|
|
unsigned RegSize =
|
|
|
|
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
|
|
|
|
|
|
|
|
if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
|
|
|
|
// These instructions only load 32 bits; we can't fold them if the
|
|
|
|
// destination register is wider than 32 bits (4 bytes).
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
|
|
|
|
// These instructions only load 64 bits; we can't fold them if the
|
|
|
|
// destination register is wider than 64 bits (8 bytes).
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-02-28 20:04:00 +08:00
|
|
|
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
|
2008-12-04 02:43:12 +08:00
|
|
|
MachineInstr *MI,
|
2015-02-28 20:04:00 +08:00
|
|
|
ArrayRef<unsigned> Ops,
|
2008-12-04 02:43:12 +08:00
|
|
|
MachineInstr *LoadMI) const {
|
2013-11-13 02:06:12 +08:00
|
|
|
// If loading from a FrameIndex, fold directly from the FrameIndex.
|
|
|
|
unsigned NumOps = LoadMI->getDesc().getNumOperands();
|
|
|
|
int FrameIndex;
|
2014-09-16 02:23:52 +08:00
|
|
|
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
|
|
|
|
if (isPartialRegisterLoad(*LoadMI, MF))
|
|
|
|
return nullptr;
|
2013-11-13 02:06:12 +08:00
|
|
|
return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
|
2014-09-16 02:23:52 +08:00
|
|
|
}
|
2013-11-13 02:06:12 +08:00
|
|
|
|
2011-01-26 10:03:37 +08:00
|
|
|
// Check the -disable-spill-fusing flag.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (NoFusing) return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2011-09-16 05:42:23 +08:00
|
|
|
// Unless optimizing for size, don't fold to avoid partial
|
|
|
|
// register update stalls
|
2015-02-14 09:59:52 +08:00
|
|
|
if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
|
2011-09-16 05:42:23 +08:00
|
|
|
hasPartialRegUpdate(MI->getOpcode()))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-12-18 15:40:29 +08:00
|
|
|
|
2008-07-12 08:10:52 +08:00
|
|
|
// Determine the alignment of the load.
|
2008-02-09 05:20:40 +08:00
|
|
|
unsigned Alignment = 0;
|
2008-07-12 08:10:52 +08:00
|
|
|
if (LoadMI->hasOneMemOperand())
|
2009-09-26 04:36:54 +08:00
|
|
|
Alignment = (*LoadMI->memoperands_begin())->getAlignment();
|
2009-09-22 02:30:38 +08:00
|
|
|
else
|
|
|
|
switch (LoadMI->getOpcode()) {
|
2011-11-20 06:34:59 +08:00
|
|
|
case X86::AVX2_SETALLONES:
|
2012-08-28 15:05:28 +08:00
|
|
|
case X86::AVX_SET0:
|
2010-08-13 04:20:53 +08:00
|
|
|
Alignment = 32;
|
|
|
|
break;
|
2011-09-29 13:10:54 +08:00
|
|
|
case X86::V_SET0:
|
2009-09-22 02:30:38 +08:00
|
|
|
case X86::V_SETALLONES:
|
|
|
|
Alignment = 16;
|
|
|
|
break;
|
|
|
|
case X86::FsFLD0SD:
|
|
|
|
Alignment = 8;
|
|
|
|
break;
|
|
|
|
case X86::FsFLD0SS:
|
|
|
|
Alignment = 4;
|
|
|
|
break;
|
|
|
|
default:
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-09-22 02:30:38 +08:00
|
|
|
}
|
2008-01-07 09:35:02 +08:00
|
|
|
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
|
|
|
|
unsigned NewOpc = 0;
|
|
|
|
switch (MI->getOpcode()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
default: return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
|
|
|
|
case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
|
|
|
|
case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
// Change to CMPXXri r, 0 first.
|
2008-01-12 02:10:50 +08:00
|
|
|
MI->setDesc(get(NewOpc));
|
2008-01-07 09:35:02 +08:00
|
|
|
MI->getOperand(1).ChangeToImmediate(0);
|
|
|
|
} else if (Ops.size() != 1)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
2010-08-12 07:08:22 +08:00
|
|
|
// Make sure the subregisters match.
|
|
|
|
// Otherwise we risk changing the size of the load.
|
|
|
|
if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2010-08-12 07:08:22 +08:00
|
|
|
|
2010-07-09 06:41:28 +08:00
|
|
|
SmallVector<MachineOperand, X86::AddrNumOperands> MOs;
|
2009-09-22 02:30:38 +08:00
|
|
|
switch (LoadMI->getOpcode()) {
|
2011-09-29 13:10:54 +08:00
|
|
|
case X86::V_SET0:
|
2009-09-22 02:30:38 +08:00
|
|
|
case X86::V_SETALLONES:
|
2011-11-20 06:34:59 +08:00
|
|
|
case X86::AVX2_SETALLONES:
|
2012-08-28 15:05:28 +08:00
|
|
|
case X86::AVX_SET0:
|
2009-09-22 02:30:38 +08:00
|
|
|
case X86::FsFLD0SD:
|
2011-11-30 06:27:25 +08:00
|
|
|
case X86::FsFLD0SS: {
|
2011-09-29 13:10:54 +08:00
|
|
|
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
|
2008-12-03 13:21:24 +08:00
|
|
|
// Create a constant-pool entry and operands to load from it.
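// Sketch of the effect (illustrative; the consuming instruction, label and
// registers are arbitrary):
//   xorps %xmm1, %xmm1 ; cmpeqps %xmm1, %xmm0
// becomes a direct read of a constant-pool zero:
//   cmpeqps .LCPI0_0(%rip), %xmm0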
|
|
|
|
|
2010-03-09 11:01:40 +08:00
|
|
|
// The medium and large code models can't fold loads this way.
|
2014-06-11 06:34:31 +08:00
|
|
|
if (MF.getTarget().getCodeModel() != CodeModel::Small &&
|
|
|
|
MF.getTarget().getCodeModel() != CodeModel::Kernel)
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2010-03-09 11:01:40 +08:00
|
|
|
|
2008-12-03 13:21:24 +08:00
|
|
|
// x86-32 PIC requires a PIC base register for constant pools.
|
|
|
|
unsigned PICBase = 0;
|
2014-06-11 06:34:31 +08:00
|
|
|
if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
|
|
|
|
if (Subtarget.is64Bit())
|
2009-07-17 02:44:05 +08:00
|
|
|
PICBase = X86::RIP;
|
2009-07-17 05:24:13 +08:00
|
|
|
else
|
2010-07-10 17:00:22 +08:00
|
|
|
// FIXME: PICBase = getGlobalBaseReg(&MF);
|
2009-07-17 02:44:05 +08:00
|
|
|
// This doesn't work for several reasons.
|
|
|
|
// 1. GlobalBaseReg may have been spilled.
|
|
|
|
// 2. It may not be live at MI.
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2009-07-17 05:24:13 +08:00
|
|
|
}
|
2008-12-03 13:21:24 +08:00
|
|
|
|
2009-09-22 02:30:38 +08:00
|
|
|
// Create a constant-pool entry.
|
2008-12-03 13:21:24 +08:00
|
|
|
MachineConstantPool &MCP = *MF.getConstantPool();
|
2011-07-18 12:54:35 +08:00
|
|
|
Type *Ty;
|
2010-08-13 04:20:53 +08:00
|
|
|
unsigned Opc = LoadMI->getOpcode();
|
2011-11-30 06:27:25 +08:00
|
|
|
if (Opc == X86::FsFLD0SS)
|
2009-09-22 02:30:38 +08:00
|
|
|
Ty = Type::getFloatTy(MF.getFunction()->getContext());
|
2011-11-30 06:27:25 +08:00
|
|
|
else if (Opc == X86::FsFLD0SD)
|
2009-09-22 02:30:38 +08:00
|
|
|
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
|
2012-08-28 15:05:28 +08:00
|
|
|
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
|
2012-01-13 14:12:41 +08:00
|
|
|
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
|
2009-09-22 02:30:38 +08:00
|
|
|
else
|
|
|
|
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
|
2011-07-26 07:05:32 +08:00
|
|
|
|
2012-08-28 15:30:47 +08:00
|
|
|
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
|
2011-07-26 07:05:32 +08:00
|
|
|
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
|
|
|
|
Constant::getNullValue(Ty);
|
2009-09-22 02:30:38 +08:00
|
|
|
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
|
2008-12-03 13:21:24 +08:00
|
|
|
|
|
|
|
// Create operands to load from the constant pool entry.
|
|
|
|
MOs.push_back(MachineOperand::CreateReg(PICBase, false));
|
|
|
|
MOs.push_back(MachineOperand::CreateImm(1));
|
|
|
|
MOs.push_back(MachineOperand::CreateReg(0, false));
|
|
|
|
MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
|
2009-04-09 05:14:34 +08:00
|
|
|
MOs.push_back(MachineOperand::CreateReg(0, false));
|
2009-09-22 02:30:38 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
2014-09-16 02:23:52 +08:00
|
|
|
if (isPartialRegisterLoad(*LoadMI, MF))
|
2014-04-25 13:30:21 +08:00
|
|
|
return nullptr;
|
2012-11-28 02:09:26 +08:00
|
|
|
|
2008-12-03 13:21:24 +08:00
|
|
|
// Folding a normal load. Just copy the load's address operands.
|
2015-02-28 21:20:15 +08:00
|
|
|
MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
|
|
|
|
LoadMI->operands_begin() + NumOps);
|
2009-09-22 02:30:38 +08:00
|
|
|
break;
|
|
|
|
}
|
2008-12-03 13:21:24 +08:00
|
|
|
}
|
2014-10-21 06:14:22 +08:00
|
|
|
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
|
|
|
|
/*Size=*/0, Alignment, /*AllowCommute=*/true);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
|
2008-10-16 09:49:15 +08:00
|
|
|
bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
|
2015-02-28 20:04:00 +08:00
|
|
|
ArrayRef<unsigned> Ops) const {
|
2011-01-26 10:03:37 +08:00
|
|
|
// Check the -disable-spill-fusing flag.
|
2008-01-07 09:35:02 +08:00
|
|
|
if (NoFusing) return false;
|
|
|
|
|
|
|
|
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
default: return false;
|
2011-01-26 10:03:37 +08:00
|
|
|
case X86::TEST8rr:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::TEST16rr:
|
|
|
|
case X86::TEST32rr:
|
|
|
|
case X86::TEST64rr:
|
|
|
|
return true;
|
2011-05-01 07:00:05 +08:00
|
|
|
case X86::ADD32ri:
|
|
|
|
// FIXME: AsmPrinter doesn't know how to handle
|
|
|
|
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
|
|
|
|
if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
|
|
|
|
return false;
|
|
|
|
break;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Ops.size() != 1)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
unsigned OpNum = Ops[0];
|
|
|
|
unsigned Opc = MI->getOpcode();
|
2008-01-07 15:27:27 +08:00
|
|
|
unsigned NumOps = MI->getDesc().getNumOperands();
|
2008-01-07 09:35:02 +08:00
|
|
|
bool isTwoAddr = NumOps > 1 &&
|
2011-06-29 03:10:37 +08:00
|
|
|
MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
|
2008-01-07 09:35:02 +08:00
|
|
|
|
|
|
|
// Folding a memory location into the two-address part of a two-address
|
|
|
|
// instruction is different than folding it other places. It requires
|
|
|
|
// replacing the *two* registers with the memory location.
|
2014-04-25 13:30:21 +08:00
|
|
|
const DenseMap<unsigned,
|
|
|
|
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
|
2011-01-26 10:03:37 +08:00
|
|
|
if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
|
2015-02-10 00:30:58 +08:00
|
|
|
} else if (OpNum == 0) {
|
2013-05-30 21:19:42 +08:00
|
|
|
if (Opc == X86::MOV32r0)
|
|
|
|
return true;
|
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable0;
|
|
|
|
} else if (OpNum == 1) {
|
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable1;
|
|
|
|
} else if (OpNum == 2) {
|
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable2;
|
2012-09-01 06:12:16 +08:00
|
|
|
} else if (OpNum == 3) {
|
|
|
|
OpcodeTablePtr = &RegOp2MemOpTable3;
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2010-10-08 11:54:52 +08:00
|
|
|
if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
|
|
|
|
return true;
|
2012-11-28 10:35:17 +08:00
|
|
|
return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
|
|
|
|
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
|
2009-02-12 05:51:19 +08:00
|
|
|
SmallVectorImpl<MachineInstr*> &NewMIs) const {
|
2010-10-08 07:08:41 +08:00
|
|
|
DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
|
|
|
|
MemOp2RegOpTable.find(MI->getOpcode());
|
2008-01-07 09:35:02 +08:00
|
|
|
if (I == MemOp2RegOpTable.end())
|
|
|
|
return false;
|
|
|
|
unsigned Opc = I->second.first;
|
2011-09-09 02:35:57 +08:00
|
|
|
unsigned Index = I->second.second & TB_INDEX_MASK;
|
|
|
|
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
|
|
|
|
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
|
2008-01-07 09:35:02 +08:00
|
|
|
if (UnfoldLoad && !FoldedLoad)
|
|
|
|
return false;
|
|
|
|
UnfoldLoad &= FoldedLoad;
|
|
|
|
if (UnfoldStore && !FoldedStore)
|
|
|
|
return false;
|
|
|
|
UnfoldStore &= FoldedStore;
|
|
|
|
|
2011-06-29 03:10:37 +08:00
|
|
|
const MCInstrDesc &MCID = get(Opc);
|
2012-05-08 06:10:26 +08:00
|
|
|
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
|
2010-07-03 04:36:18 +08:00
|
|
|
if (!MI->hasOneMemOperand() &&
|
|
|
|
RC == &X86::VR128RegClass &&
|
2014-06-11 06:34:31 +08:00
|
|
|
!Subtarget.isUnalignedMemAccessFast())
|
2010-07-03 04:36:18 +08:00
|
|
|
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
|
|
|
|
// conservatively assume the address is unaligned. That's bad for
|
|
|
|
// performance.
|
|
|
|
return false;
|
2010-07-09 06:41:28 +08:00
|
|
|
SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
|
2008-01-07 09:35:02 +08:00
|
|
|
SmallVector<MachineOperand,2> BeforeOps;
|
|
|
|
SmallVector<MachineOperand,2> AfterOps;
|
|
|
|
SmallVector<MachineOperand,4> ImpOps;
|
|
|
|
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
|
|
|
MachineOperand &Op = MI->getOperand(i);
|
2010-07-09 06:41:28 +08:00
|
|
|
if (i >= Index && i < Index + X86::AddrNumOperands)
|
2008-01-07 09:35:02 +08:00
|
|
|
AddrOps.push_back(Op);
|
2008-10-03 23:45:36 +08:00
|
|
|
else if (Op.isReg() && Op.isImplicit())
|
2008-01-07 09:35:02 +08:00
|
|
|
ImpOps.push_back(Op);
|
|
|
|
else if (i < Index)
|
|
|
|
BeforeOps.push_back(Op);
|
|
|
|
else if (i > Index)
|
|
|
|
AfterOps.push_back(Op);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the load instruction.
|
|
|
|
if (UnfoldLoad) {
|
2009-10-10 02:10:05 +08:00
|
|
|
std::pair<MachineInstr::mmo_iterator,
|
|
|
|
MachineInstr::mmo_iterator> MMOs =
|
|
|
|
MF.extractLoadMemRefs(MI->memoperands_begin(),
|
|
|
|
MI->memoperands_end());
|
|
|
|
loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
|
2008-01-07 09:35:02 +08:00
|
|
|
if (UnfoldStore) {
|
|
|
|
// Address operands cannot be marked isKill.
|
2010-07-09 06:41:28 +08:00
|
|
|
for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
|
2008-01-07 09:35:02 +08:00
|
|
|
MachineOperand &MO = NewMIs[0]->getOperand(i);
|
2008-10-03 23:45:36 +08:00
|
|
|
if (MO.isReg())
|
2008-01-07 09:35:02 +08:00
|
|
|
MO.setIsKill(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the data processing instruction.
|
2011-06-29 03:10:37 +08:00
|
|
|
MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
|
2012-12-20 05:31:56 +08:00
|
|
|
MachineInstrBuilder MIB(MF, DataMI);
|
2011-01-26 10:03:37 +08:00
|
|
|
|
2008-01-07 09:35:02 +08:00
|
|
|
if (FoldedStore)
|
2009-05-14 05:33:08 +08:00
|
|
|
MIB.addReg(Reg, RegState::Define);
|
2008-01-07 09:35:02 +08:00
|
|
|
for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(BeforeOps[i]);
|
2008-01-07 09:35:02 +08:00
|
|
|
if (FoldedLoad)
|
|
|
|
MIB.addReg(Reg);
|
|
|
|
for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
|
2009-02-18 13:45:50 +08:00
|
|
|
MIB.addOperand(AfterOps[i]);
|
2008-01-07 09:35:02 +08:00
|
|
|
for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
|
|
|
|
MachineOperand &MO = ImpOps[i];
|
2009-05-14 05:33:08 +08:00
|
|
|
MIB.addReg(MO.getReg(),
|
|
|
|
getDefRegState(MO.isDef()) |
|
|
|
|
RegState::Implicit |
|
|
|
|
getKillRegState(MO.isKill()) |
|
2009-06-30 16:49:04 +08:00
|
|
|
getDeadRegState(MO.isDead()) |
|
|
|
|
getUndefRegState(MO.isUndef()));
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
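// For example (illustrative): a compare that was rewritten to
//   cmpl $0, %eax
// while its operands were folded is turned back into
//   testl %eax, %eax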
|
|
|
|
switch (DataMI->getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case X86::CMP64ri32:
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP64ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP32ri:
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP32ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP16ri:
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP16ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP8ri: {
|
|
|
|
MachineOperand &MO0 = DataMI->getOperand(0);
|
|
|
|
MachineOperand &MO1 = DataMI->getOperand(1);
|
|
|
|
if (MO1.getImm() == 0) {
|
2012-08-21 16:16:16 +08:00
|
|
|
unsigned NewOpc;
|
2008-01-07 09:35:02 +08:00
|
|
|
switch (DataMI->getOpcode()) {
|
2012-08-21 16:16:16 +08:00
|
|
|
default: llvm_unreachable("Unreachable!");
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP64ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP32ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
|
2010-05-19 05:54:15 +08:00
|
|
|
case X86::CMP16ri8:
|
2008-01-07 09:35:02 +08:00
|
|
|
case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
|
|
|
|
case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
|
|
|
|
}
|
2008-01-12 02:10:50 +08:00
|
|
|
DataMI->setDesc(get(NewOpc));
|
2008-01-07 09:35:02 +08:00
|
|
|
MO1.ChangeToRegister(MO0.getReg(), false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
NewMIs.push_back(DataMI);
|
|
|
|
|
|
|
|
// Emit the store instruction.
|
|
|
|
if (UnfoldStore) {
|
2012-05-08 06:10:26 +08:00
|
|
|
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
|
2009-10-10 02:10:05 +08:00
|
|
|
std::pair<MachineInstr::mmo_iterator,
|
|
|
|
MachineInstr::mmo_iterator> MMOs =
|
|
|
|
MF.extractStoreMemRefs(MI->memoperands_begin(),
|
|
|
|
MI->memoperands_end());
|
|
|
|
storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
|
2008-01-07 09:35:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                                  SmallVectorImpl<SDNode*> &NewNodes) const {
  if (!N->isMachineOpcode())
    return false;

  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
    MemOp2RegOpTable.find(N->getMachineOpcode());
  if (I == MemOp2RegOpTable.end())
    return false;
  unsigned Opc = I->second.first;
  unsigned Index = I->second.second & TB_INDEX_MASK;
  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
  const MCInstrDesc &MCID = get(Opc);
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
  unsigned NumDefs = MCID.NumDefs;
  std::vector<SDValue> AddrOps;
  std::vector<SDValue> BeforeOps;
  std::vector<SDValue> AfterOps;
  SDLoc dl(N);
  unsigned NumOps = N->getNumOperands();
  for (unsigned i = 0; i != NumOps-1; ++i) {
    SDValue Op = N->getOperand(i);
    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
      AddrOps.push_back(Op);
    else if (i < Index-NumDefs)
      BeforeOps.push_back(Op);
    else if (i > Index-NumDefs)
      AfterOps.push_back(Op);
  }
  SDValue Chain = N->getOperand(NumOps-1);
  AddrOps.push_back(Chain);

  // Emit the load instruction.
  SDNode *Load = nullptr;
  if (FoldedLoad) {
    EVT VT = *RC->vt_begin();
    std::pair<MachineInstr::mmo_iterator,
              MachineInstr::mmo_iterator> MMOs =
      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                            cast<MachineSDNode>(N)->memoperands_end());
    if (!(*MMOs.first) &&
        RC == &X86::VR128RegClass &&
        !Subtarget.isUnalignedMemAccessFast())
      // Do not introduce a slow unaligned load.
      return false;
    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
    bool isAligned = (*MMOs.first) &&
                     (*MMOs.first)->getAlignment() >= Alignment;
    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
                              VT, MVT::Other, AddrOps);
    NewNodes.push_back(Load);

    // Preserve memory reference information.
    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
  }

  // Emit the data processing instruction.
  std::vector<EVT> VTs;
  const TargetRegisterClass *DstRC = nullptr;
  if (MCID.getNumDefs() > 0) {
    DstRC = getRegClass(MCID, 0, &RI, MF);
    VTs.push_back(*DstRC->vt_begin());
  }
  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
    EVT VT = N->getValueType(i);
    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
      VTs.push_back(VT);
  }
  if (Load)
    BeforeOps.push_back(SDValue(Load, 0));
  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
  SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
  NewNodes.push_back(NewNode);

  // Emit the store instruction.
  if (FoldedStore) {
    AddrOps.pop_back();
    AddrOps.push_back(SDValue(NewNode, 0));
    AddrOps.push_back(Chain);
    std::pair<MachineInstr::mmo_iterator,
              MachineInstr::mmo_iterator> MMOs =
      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                             cast<MachineSDNode>(N)->memoperands_end());
    if (!(*MMOs.first) &&
        RC == &X86::VR128RegClass &&
        !Subtarget.isUnalignedMemAccessFast())
      // Do not introduce a slow unaligned store.
      return false;
    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
    bool isAligned = (*MMOs.first) &&
                     (*MMOs.first)->getAlignment() >= Alignment;
    SDNode *Store =
      DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
                         dl, MVT::Other, AddrOps);
    NewNodes.push_back(Store);

    // Preserve memory reference information.
    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
  }

  return true;
}

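// Note on the table consulted below: MemOp2RegOpTable maps a memory-form
// opcode to its register-form opcode plus a flags word, where TB_FOLDED_LOAD
// and TB_FOLDED_STORE record which memory access was folded and TB_INDEX_MASK
// holds the operand index of the folded memory reference.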
unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
                                      bool UnfoldLoad, bool UnfoldStore,
                                      unsigned *LoadRegIndex) const {
  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
    MemOp2RegOpTable.find(Opc);
  if (I == MemOp2RegOpTable.end())
    return 0;
  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
  if (UnfoldLoad && !FoldedLoad)
    return 0;
  if (UnfoldStore && !FoldedStore)
    return 0;
  if (LoadRegIndex)
    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
  return I->second.first;
}

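// The operand-index checks in areLoadsFromSameBasePtr below assume the
// standard layout of a load machine node's operands: 0 = base, 1 = scale,
// 2 = index, 3 = displacement, 4 = segment, and the final operand is the
// chain.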
bool
X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                      int64_t &Offset1, int64_t &Offset2) const {
  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
    return false;
  unsigned Opc1 = Load1->getMachineOpcode();
  unsigned Opc2 = Load2->getMachineOpcode();
  switch (Opc1) {
  default: return false;
  case X86::MOV8rm:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::LD_Fp32m:
  case X86::LD_Fp64m:
  case X86::LD_Fp80m:
  case X86::MOVSSrm:
  case X86::MOVSDrm:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::FsMOVAPSrm:
  case X86::FsMOVAPDrm:
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  // AVX load instructions
  case X86::VMOVSSrm:
  case X86::VMOVSDrm:
  case X86::FsVMOVAPSrm:
  case X86::FsVMOVAPDrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
    break;
  }
  switch (Opc2) {
  default: return false;
  case X86::MOV8rm:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::LD_Fp32m:
  case X86::LD_Fp64m:
  case X86::LD_Fp80m:
  case X86::MOVSSrm:
  case X86::MOVSDrm:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::FsMOVAPSrm:
  case X86::FsMOVAPDrm:
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  // AVX load instructions
  case X86::VMOVSSrm:
  case X86::VMOVSDrm:
  case X86::FsVMOVAPSrm:
  case X86::FsVMOVAPDrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
    break;
  }

  // Check if chain operands and base addresses match.
  if (Load1->getOperand(0) != Load2->getOperand(0) ||
      Load1->getOperand(5) != Load2->getOperand(5))
    return false;
  // Segment operands should match as well.
  if (Load1->getOperand(4) != Load2->getOperand(4))
    return false;
  // Scale should be 1, Index should be Reg0.
  if (Load1->getOperand(1) == Load2->getOperand(1) &&
      Load1->getOperand(2) == Load2->getOperand(2)) {
    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
      return false;

    // Now let's examine the displacements.
    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
        isa<ConstantSDNode>(Load2->getOperand(3))) {
      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
      return true;
    }
  }
  return false;
}

bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                           int64_t Offset1, int64_t Offset2,
                                           unsigned NumLoads) const {
  assert(Offset2 > Offset1);
  if ((Offset2 - Offset1) / 8 > 64)
    return false;

  unsigned Opc1 = Load1->getMachineOpcode();
  unsigned Opc2 = Load2->getMachineOpcode();
  if (Opc1 != Opc2)
    return false;  // FIXME: overly conservative?

  switch (Opc1) {
  default: break;
  case X86::LD_Fp32m:
  case X86::LD_Fp64m:
  case X86::LD_Fp80m:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
    return false;
  }

  EVT VT = Load1->getValueType(0);
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
    // have 16 of them to play with.
    if (Subtarget.is64Bit()) {
      if (NumLoads >= 3)
        return false;
    } else if (NumLoads) {
      return false;
    }
    break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
  case MVT::f32:
  case MVT::f64:
    if (NumLoads)
      return false;
    break;
  }

  return true;
}

bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
                                          MachineInstr *Second) const {
  // Check if this processor supports macro-fusion. Since this is a minor
  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
  // proxy for SandyBridge+.
  if (!Subtarget.hasAVX())
    return false;

  enum {
    FuseTest,
    FuseCmp,
    FuseInc
  } FuseKind;

  switch(Second->getOpcode()) {
  default:
    return false;
  case X86::JE_1:
  case X86::JNE_1:
  case X86::JL_1:
  case X86::JLE_1:
  case X86::JG_1:
  case X86::JGE_1:
    FuseKind = FuseInc;
    break;
  case X86::JB_1:
  case X86::JBE_1:
  case X86::JA_1:
  case X86::JAE_1:
    FuseKind = FuseCmp;
    break;
  case X86::JS_1:
  case X86::JNS_1:
  case X86::JP_1:
  case X86::JNP_1:
  case X86::JO_1:
  case X86::JNO_1:
    FuseKind = FuseTest;
    break;
  }
  switch (First->getOpcode()) {
  default:
    return false;
  case X86::TEST8rr:
  case X86::TEST16rr:
  case X86::TEST32rr:
  case X86::TEST64rr:
  case X86::TEST8ri:
  case X86::TEST16ri:
  case X86::TEST32ri:
  case X86::TEST32i32:
  case X86::TEST64i32:
  case X86::TEST64ri32:
  case X86::TEST8rm:
  case X86::TEST16rm:
  case X86::TEST32rm:
  case X86::TEST64rm:
  case X86::TEST8ri_NOREX:
  case X86::AND16i16:
  case X86::AND16ri:
  case X86::AND16ri8:
  case X86::AND16rm:
  case X86::AND16rr:
  case X86::AND32i32:
  case X86::AND32ri:
  case X86::AND32ri8:
  case X86::AND32rm:
  case X86::AND32rr:
  case X86::AND64i32:
  case X86::AND64ri32:
  case X86::AND64ri8:
  case X86::AND64rm:
  case X86::AND64rr:
  case X86::AND8i8:
  case X86::AND8ri:
  case X86::AND8rm:
  case X86::AND8rr:
    return true;
  case X86::CMP16i16:
  case X86::CMP16ri:
  case X86::CMP16ri8:
  case X86::CMP16rm:
  case X86::CMP16rr:
  case X86::CMP32i32:
  case X86::CMP32ri:
  case X86::CMP32ri8:
  case X86::CMP32rm:
  case X86::CMP32rr:
  case X86::CMP64i32:
  case X86::CMP64ri32:
  case X86::CMP64ri8:
  case X86::CMP64rm:
  case X86::CMP64rr:
  case X86::CMP8i8:
  case X86::CMP8ri:
  case X86::CMP8rm:
  case X86::CMP8rr:
  case X86::ADD16i16:
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri8_DB:
  case X86::ADD16ri_DB:
  case X86::ADD16rm:
  case X86::ADD16rr:
  case X86::ADD16rr_DB:
  case X86::ADD32i32:
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri8_DB:
  case X86::ADD32ri_DB:
  case X86::ADD32rm:
  case X86::ADD32rr:
  case X86::ADD32rr_DB:
  case X86::ADD64i32:
  case X86::ADD64ri32:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8:
  case X86::ADD64ri8_DB:
  case X86::ADD64rm:
  case X86::ADD64rr:
  case X86::ADD64rr_DB:
  case X86::ADD8i8:
  case X86::ADD8mi:
  case X86::ADD8mr:
  case X86::ADD8ri:
  case X86::ADD8rm:
  case X86::ADD8rr:
  case X86::SUB16i16:
  case X86::SUB16ri:
  case X86::SUB16ri8:
  case X86::SUB16rm:
  case X86::SUB16rr:
  case X86::SUB32i32:
  case X86::SUB32ri:
  case X86::SUB32ri8:
  case X86::SUB32rm:
  case X86::SUB32rr:
  case X86::SUB64i32:
  case X86::SUB64ri32:
  case X86::SUB64ri8:
  case X86::SUB64rm:
  case X86::SUB64rr:
  case X86::SUB8i8:
  case X86::SUB8ri:
  case X86::SUB8rm:
  case X86::SUB8rr:
    return FuseKind == FuseCmp || FuseKind == FuseInc;
  case X86::INC16r:
  case X86::INC32r:
  case X86::INC64r:
  case X86::INC8r:
  case X86::DEC16r:
  case X86::DEC32r:
  case X86::DEC64r:
  case X86::DEC8r:
    return FuseKind == FuseInc;
  }
}

bool X86InstrInfo::
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
    return true;
  Cond[0].setImm(GetOppositeBranchCondition(CC));
  return false;
}

bool X86InstrInfo::
isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  // FIXME: Return false for x87 stack register classes for now. We can't
  // allow any loads of these registers before FpGet_ST0_80.
  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
}

/// Return a virtual register initialized with the global base register
/// value. Output instructions required to initialize the register in the
/// function entry block, if necessary.
///
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
  assert(!Subtarget.is64Bit() &&
         "X86-64 PIC uses RIP relative addressing");

  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
  if (GlobalBaseReg != 0)
    return GlobalBaseReg;

  // Create the register. The code to initialize it is inserted
  // later, by the CGBR pass (below).
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  X86FI->setGlobalBaseReg(GlobalBaseReg);
  return GlobalBaseReg;
}

// These are the replaceable SSE instructions. Some of these have Int variants
// that we don't include here. We don't want to replace instructions selected
// by intrinsics.
static const uint16_t ReplaceableInstrs[][3] = {
  //PackedSingle      PackedDouble     PackedInt
  { X86::MOVAPSmr,    X86::MOVAPDmr,   X86::MOVDQAmr   },
  { X86::MOVAPSrm,    X86::MOVAPDrm,   X86::MOVDQArm   },
  { X86::MOVAPSrr,    X86::MOVAPDrr,   X86::MOVDQArr   },
  { X86::MOVUPSmr,    X86::MOVUPDmr,   X86::MOVDQUmr   },
  { X86::MOVUPSrm,    X86::MOVUPDrm,   X86::MOVDQUrm   },
  { X86::MOVLPSmr,    X86::MOVLPDmr,   X86::MOVPQI2QImr },
  { X86::MOVNTPSmr,   X86::MOVNTPDmr,  X86::MOVNTDQmr  },
  { X86::ANDNPSrm,    X86::ANDNPDrm,   X86::PANDNrm    },
  { X86::ANDNPSrr,    X86::ANDNPDrr,   X86::PANDNrr    },
  { X86::ANDPSrm,     X86::ANDPDrm,    X86::PANDrm     },
  { X86::ANDPSrr,     X86::ANDPDrr,    X86::PANDrr     },
  { X86::ORPSrm,      X86::ORPDrm,     X86::PORrm      },
  { X86::ORPSrr,      X86::ORPDrr,     X86::PORrr      },
  { X86::XORPSrm,     X86::XORPDrm,    X86::PXORrm     },
  { X86::XORPSrr,     X86::XORPDrr,    X86::PXORrr     },
  // AVX 128-bit support
  { X86::VMOVAPSmr,   X86::VMOVAPDmr,  X86::VMOVDQAmr  },
  { X86::VMOVAPSrm,   X86::VMOVAPDrm,  X86::VMOVDQArm  },
  { X86::VMOVAPSrr,   X86::VMOVAPDrr,  X86::VMOVDQArr  },
  { X86::VMOVUPSmr,   X86::VMOVUPDmr,  X86::VMOVDQUmr  },
  { X86::VMOVUPSrm,   X86::VMOVUPDrm,  X86::VMOVDQUrm  },
  { X86::VMOVLPSmr,   X86::VMOVLPDmr,  X86::VMOVPQI2QImr },
  { X86::VMOVNTPSmr,  X86::VMOVNTPDmr, X86::VMOVNTDQmr },
  { X86::VANDNPSrm,   X86::VANDNPDrm,  X86::VPANDNrm   },
  { X86::VANDNPSrr,   X86::VANDNPDrr,  X86::VPANDNrr   },
  { X86::VANDPSrm,    X86::VANDPDrm,   X86::VPANDrm    },
  { X86::VANDPSrr,    X86::VANDPDrr,   X86::VPANDrr    },
  { X86::VORPSrm,     X86::VORPDrm,    X86::VPORrm     },
  { X86::VORPSrr,     X86::VORPDrr,    X86::VPORrr     },
  { X86::VXORPSrm,    X86::VXORPDrm,   X86::VPXORrm    },
  { X86::VXORPSrr,    X86::VXORPDrr,   X86::VPXORrr    },
  // AVX 256-bit support
  { X86::VMOVAPSYmr,  X86::VMOVAPDYmr, X86::VMOVDQAYmr },
  { X86::VMOVAPSYrm,  X86::VMOVAPDYrm, X86::VMOVDQAYrm },
  { X86::VMOVAPSYrr,  X86::VMOVAPDYrr, X86::VMOVDQAYrr },
  { X86::VMOVUPSYmr,  X86::VMOVUPDYmr, X86::VMOVDQUYmr },
  { X86::VMOVUPSYrm,  X86::VMOVUPDYrm, X86::VMOVDQUYrm },
  { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
};

static const uint16_t ReplaceableInstrsAVX2[][3] = {
  //PackedSingle        PackedDouble         PackedInt
  { X86::VANDNPSYrm,    X86::VANDNPDYrm,     X86::VPANDNYrm   },
  { X86::VANDNPSYrr,    X86::VANDNPDYrr,     X86::VPANDNYrr   },
  { X86::VANDPSYrm,     X86::VANDPDYrm,      X86::VPANDYrm    },
  { X86::VANDPSYrr,     X86::VANDPDYrr,      X86::VPANDYrr    },
  { X86::VORPSYrm,      X86::VORPDYrm,       X86::VPORYrm     },
  { X86::VORPSYrr,      X86::VORPDYrr,       X86::VPORYrr     },
  { X86::VXORPSYrm,     X86::VXORPDYrm,      X86::VPXORYrm    },
  { X86::VXORPSYrr,     X86::VXORPDYrr,      X86::VPXORYrr    },
  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
  { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
  { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
  { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
  { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
  { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
  { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
  { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
  { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
};

// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.

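// The lookup helpers below index each table row with (domain - 1): domain 1
// is PackedSingle, 2 is PackedDouble, and 3 is PackedInt, matching the three
// columns of ReplaceableInstrs and ReplaceableInstrsAVX2.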
static const uint16_t *lookup(unsigned opcode, unsigned domain) {
  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
    if (ReplaceableInstrs[i][domain-1] == opcode)
      return ReplaceableInstrs[i];
  return nullptr;
}

static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
    if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
      return ReplaceableInstrsAVX2[i];
  return nullptr;
}

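// getExecutionDomain reports the instruction's current SSE execution domain
// together with a bitmask of domains it could be switched to: 0xe allows
// PackedSingle, PackedDouble and PackedInt, while 0x6 (used for 256-bit ops
// without AVX2) allows only PackedSingle and PackedDouble.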
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
  uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
  bool hasAVX2 = Subtarget.hasAVX2();
  uint16_t validDomains = 0;
  if (domain && lookup(MI->getOpcode(), domain))
    validDomains = 0xe;
  else if (domain && lookupAVX2(MI->getOpcode(), domain))
    validDomains = hasAVX2 ? 0xe : 0x6;
  return std::make_pair(domain, validDomains);
}

void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
  assert(Domain>0 && Domain<4 && "Invalid execution domain");
  uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
  assert(dom && "Not an SSE instruction");
  const uint16_t *table = lookup(MI->getOpcode(), dom);
  if (!table) { // try the other table
    assert((Subtarget.hasAVX2() || Domain < 3) &&
           "256-bit vector operations only available in AVX2");
    table = lookupAVX2(MI->getOpcode(), dom);
  }
  assert(table && "Cannot change domain");
  MI->setDesc(get(table[Domain-1]));
}

/// Return the noop instruction to use for a noop.
void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
  NopInst.setOpcode(X86::NOOP);
}

// This code must remain in sync with getJumpInstrTableEntryBound in this class!
// In particular, getJumpInstrTableEntryBound must always return an upper bound
// on the encoding lengths of the instructions generated by
// getUnconditionalBranch and getTrap.
void X86InstrInfo::getUnconditionalBranch(
    MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
  Branch.setOpcode(X86::JMP_1);
  Branch.addOperand(MCOperand::createExpr(BranchTarget));
}

// This code must remain in sync with getJumpInstrTableEntryBound in this class!
// In particular, getJumpInstrTableEntryBound must always return an upper bound
// on the encoding lengths of the instructions generated by
// getUnconditionalBranch and getTrap.
void X86InstrInfo::getTrap(MCInst &MI) const {
  MI.setOpcode(X86::TRAP);
}

// See getTrap and getUnconditionalBranch for conditions on the value returned
// by this function.
unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
  // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the JMP_4 and 4
  // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B).
  return 5;
}

bool X86InstrInfo::isHighLatencyDef(int opc) const {
  switch (opc) {
  default: return false;
  case X86::DIVSDrm:
  case X86::DIVSDrm_Int:
  case X86::DIVSDrr:
  case X86::DIVSDrr_Int:
  case X86::DIVSSrm:
  case X86::DIVSSrm_Int:
  case X86::DIVSSrr:
  case X86::DIVSSrr_Int:
  case X86::SQRTPDm:
  case X86::SQRTPDr:
  case X86::SQRTPSm:
  case X86::SQRTPSr:
  case X86::SQRTSDm:
  case X86::SQRTSDm_Int:
  case X86::SQRTSDr:
  case X86::SQRTSDr_Int:
  case X86::SQRTSSm:
  case X86::SQRTSSm_Int:
  case X86::SQRTSSr:
  case X86::SQRTSSr_Int:
  // AVX instructions with high latency
  case X86::VDIVSDrm:
  case X86::VDIVSDrm_Int:
  case X86::VDIVSDrr:
  case X86::VDIVSDrr_Int:
  case X86::VDIVSSrm:
  case X86::VDIVSSrm_Int:
  case X86::VDIVSSrr:
  case X86::VDIVSSrr_Int:
  case X86::VSQRTPDm:
  case X86::VSQRTPDr:
  case X86::VSQRTPSm:
  case X86::VSQRTPSr:
  case X86::VSQRTSDm:
  case X86::VSQRTSDm_Int:
  case X86::VSQRTSDr:
  case X86::VSQRTSSm:
  case X86::VSQRTSSm_Int:
  case X86::VSQRTSSr:
  case X86::VSQRTPDZm:
  case X86::VSQRTPDZr:
  case X86::VSQRTPSZm:
  case X86::VSQRTPSZr:
  case X86::VSQRTSDZm:
  case X86::VSQRTSDZm_Int:
  case X86::VSQRTSDZr:
  case X86::VSQRTSSZm_Int:
  case X86::VSQRTSSZr:
  case X86::VSQRTSSZm:
  case X86::VDIVSDZrm:
  case X86::VDIVSDZrr:
  case X86::VDIVSSZrm:
  case X86::VDIVSSZrr:

  case X86::VGATHERQPSZrm:
  case X86::VGATHERQPDZrm:
  case X86::VGATHERDPDZrm:
  case X86::VGATHERDPSZrm:
  case X86::VPGATHERQDZrm:
  case X86::VPGATHERQQZrm:
  case X86::VPGATHERDDZrm:
  case X86::VPGATHERDQZrm:
  case X86::VSCATTERQPDZmr:
  case X86::VSCATTERQPSZmr:
  case X86::VSCATTERDPDZmr:
  case X86::VSCATTERDPSZmr:
  case X86::VPSCATTERQDZmr:
  case X86::VPSCATTERQQZmr:
  case X86::VPSCATTERDDZmr:
  case X86::VPSCATTERDQZmr:
    return true;
  }
}

bool X86InstrInfo::
hasHighOperandLatency(const InstrItineraryData *ItinData,
                      const MachineRegisterInfo *MRI,
                      const MachineInstr *DefMI, unsigned DefIdx,
                      const MachineInstr *UseMI, unsigned UseIdx) const {
  return isHighLatencyDef(DefMI->getOpcode());
}

namespace {
  /// Create Global Base Reg pass. This initializes the PIC
  /// global base register for x86-32.
  struct CGBR : public MachineFunctionPass {
    static char ID;
    CGBR() : MachineFunctionPass(ID) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      const X86TargetMachine *TM =
        static_cast<const X86TargetMachine *>(&MF.getTarget());
      const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();

      // Don't do anything if this is 64-bit as 64-bit PIC
      // uses RIP relative addressing.
      if (STI.is64Bit())
        return false;

      // Only emit a global base reg in PIC mode.
      if (TM->getRelocationModel() != Reloc::PIC_)
        return false;

      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
      unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();

      // If we didn't need a GlobalBaseReg, don't insert code.
      if (GlobalBaseReg == 0)
        return false;

      // Insert the set of GlobalBaseReg into the first MBB of the function
      MachineBasicBlock &FirstMBB = MF.front();
      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      const X86InstrInfo *TII = STI.getInstrInfo();

      unsigned PC;
      if (STI.isPICStyleGOT())
        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
      else
        PC = GlobalBaseReg;

      // Operand of MovePCtoStack is completely ignored by asm printer. It's
      // only used in JIT code emission as displacement to pc.
      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);

      // If we're using vanilla 'GOT' PIC style, we should use relative
      // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
      if (STI.isPICStyleGOT()) {
        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
      }

      return true;
    }

    const char *getPassName() const override {
      return "X86 PIC Global Base Reg Initialization";
    }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      MachineFunctionPass::getAnalysisUsage(AU);
    }
  };
}

char CGBR::ID = 0;
FunctionPass*
llvm::createX86GlobalBaseRegPass() { return new CGBR(); }

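// The pass below cleans up the local-dynamic TLS model: when a function makes
// several TLS_base_addr calls, the first call on each dominator-tree path is
// kept, its result is copied into a virtual register, and dominated calls are
// replaced with copies from that register.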
namespace {
  struct LDTLSCleanup : public MachineFunctionPass {
    static char ID;
    LDTLSCleanup() : MachineFunctionPass(ID) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
        // No point folding accesses if there aren't at least two.
        return false;
      }

      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
      return VisitNode(DT->getRootNode(), 0);
    }

    // Visit the dominator subtree rooted at Node in pre-order.
    // If TLSBaseAddrReg is non-null, then use that to replace any
    // TLS_base_addr instructions. Otherwise, create the register
    // when the first such instruction is seen, and then use it
    // as we encounter more instructions.
    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
      MachineBasicBlock *BB = Node->getBlock();
      bool Changed = false;

      // Traverse the current block.
      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
           ++I) {
        switch (I->getOpcode()) {
          case X86::TLS_base_addr32:
          case X86::TLS_base_addr64:
            if (TLSBaseAddrReg)
              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
            else
              I = SetRegister(I, &TLSBaseAddrReg);
            Changed = true;
            break;
          default:
            break;
        }
      }

      // Visit the children of this block in the dominator tree.
      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
           I != E; ++I) {
        Changed |= VisitNode(*I, TLSBaseAddrReg);
      }

      return Changed;
    }

    // Replace the TLS_base_addr instruction I with a copy from
    // TLSBaseAddrReg, returning the new instruction.
    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
                                         unsigned TLSBaseAddrReg) {
      MachineFunction *MF = I->getParent()->getParent();
      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
      const bool is64Bit = STI.is64Bit();
      const X86InstrInfo *TII = STI.getInstrInfo();

      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
                                   TII->get(TargetOpcode::COPY),
                                   is64Bit ? X86::RAX : X86::EAX)
                           .addReg(TLSBaseAddrReg);

      // Erase the TLS_base_addr instruction.
      I->eraseFromParent();

      return Copy;
    }

    // Create a virtual register in *TLSBaseAddrReg, and populate it by
    // inserting a copy instruction after I. Returns the new instruction.
    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
      MachineFunction *MF = I->getParent()->getParent();
      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
      const bool is64Bit = STI.is64Bit();
      const X86InstrInfo *TII = STI.getInstrInfo();

      // Create a virtual register for the TLS base address.
      MachineRegisterInfo &RegInfo = MF->getRegInfo();
      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
                                                      ? &X86::GR64RegClass
                                                      : &X86::GR32RegClass);

      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
      MachineInstr *Next = I->getNextNode();
      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
                                   TII->get(TargetOpcode::COPY),
                                   *TLSBaseAddrReg)
                           .addReg(is64Bit ? X86::RAX : X86::EAX);

      return Copy;
    }

    const char *getPassName() const override {
      return "Local Dynamic TLS Access Clean-up";
    }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      AU.addRequired<MachineDominatorTree>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }
  };
}

char LDTLSCleanup::ID = 0;
FunctionPass*
llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }