forked from OSchip/llvm-project
Making use of VFP / NEON floating point multiply-accumulate / subtraction is
difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first few cycles (4? on Cortex-A8) can cause additional pipeline stalls. So it's frequently better to simply codegen vmul + vadd. 2. A vmla followed by a vmul, vadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed by a vmla is a special case. Obviously, issuing back-to-back RAW vmla + vmla is very bad. But this isn't ideal either: vmul, vadd, vmla. Instead, we want to expand the second vmla: vmla, vmul, vadd. Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoided codegen'ing fp vmla / vmls. This works well enough but it isn't the optimal solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. llvm-svn: 120960
This commit is contained in:
parent
a3fb8cb3d4
commit
62c7b5bf76
|
@ -49,6 +49,7 @@ FunctionPass *createARMExpandPseudoPass();
|
||||||
FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
|
FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
|
||||||
FunctionPass *createARMConstantIslandPass();
|
FunctionPass *createARMConstantIslandPass();
|
||||||
FunctionPass *createNEONMoveFixPass();
|
FunctionPass *createNEONMoveFixPass();
|
||||||
|
FunctionPass *createMLxExpansionPass();
|
||||||
FunctionPass *createThumb2ITBlockPass();
|
FunctionPass *createThumb2ITBlockPass();
|
||||||
FunctionPass *createThumb2SizeReductionPass();
|
FunctionPass *createThumb2SizeReductionPass();
|
||||||
|
|
||||||
|
|
|
@ -46,14 +46,11 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
|
||||||
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
|
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
|
||||||
"Floating point unit supports single precision only">;
|
"Floating point unit supports single precision only">;
|
||||||
|
|
||||||
// Some processors have multiply-accumulate instructions that don't
|
// Some processors have FP multiply-accumulate instructions that don't
|
||||||
// play nicely with other VFP instructions, and it's generally better
|
// play nicely with other VFP / NEON instructions, and it's generally better
|
||||||
// to just not use them.
|
// to just not use them.
|
||||||
// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for
|
def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
|
||||||
// others as well. We should do more benchmarking and confirm one way or
|
"Disable VFP / NEON MAC instructions">;
|
||||||
// the other.
|
|
||||||
def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true",
|
|
||||||
"Disable VFP MAC instructions">;
|
|
||||||
// Some processors benefit from using NEON instructions for scalar
|
// Some processors benefit from using NEON instructions for scalar
|
||||||
// single-precision FP operations.
|
// single-precision FP operations.
|
||||||
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
|
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
|
||||||
|
@ -150,26 +147,29 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>;
|
||||||
// V6 Processors.
|
// V6 Processors.
|
||||||
def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>;
|
def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>;
|
||||||
def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2,
|
def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2,
|
||||||
FeatureHasSlowVMLx]>;
|
FeatureHasSlowFPVMLx]>;
|
||||||
def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>;
|
def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>;
|
||||||
def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
|
def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2,
|
||||||
|
FeatureHasSlowFPVMLx]>;
|
||||||
def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>;
|
def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>;
|
||||||
def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
|
def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2,
|
||||||
|
FeatureHasSlowFPVMLx]>;
|
||||||
|
|
||||||
// V6M Processors.
|
// V6M Processors.
|
||||||
def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>;
|
def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>;
|
||||||
|
|
||||||
// V6T2 Processors.
|
// V6T2 Processors.
|
||||||
def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>;
|
def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>;
|
||||||
def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>;
|
def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2,
|
||||||
|
FeatureHasSlowFPVMLx]>;
|
||||||
|
|
||||||
// V7 Processors.
|
// V7 Processors.
|
||||||
def : Processor<"cortex-a8", CortexA8Itineraries,
|
def : Processor<"cortex-a8", CortexA8Itineraries,
|
||||||
[ArchV7A, ProcA8,
|
[ArchV7A, ProcA8,
|
||||||
FeatureHasSlowVMLx, FeatureT2XtPk]>;
|
FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
|
||||||
def : Processor<"cortex-a9", CortexA9Itineraries,
|
def : Processor<"cortex-a9", CortexA9Itineraries,
|
||||||
[ArchV7A, ProcA9,
|
[ArchV7A, ProcA9,
|
||||||
FeatureHasSlowVMLx, FeatureT2XtPk]>;
|
FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
|
||||||
|
|
||||||
// V7M Processors.
|
// V7M Processors.
|
||||||
def : ProcNoItin<"cortex-m3", [ArchV7M]>;
|
def : ProcNoItin<"cortex-m3", [ArchV7M]>;
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include "ARM.h"
|
#include "ARM.h"
|
||||||
#include "ARMAddressingModes.h"
|
#include "ARMAddressingModes.h"
|
||||||
#include "ARMConstantPoolValue.h"
|
#include "ARMConstantPoolValue.h"
|
||||||
|
#include "ARMHazardRecognizer.h"
|
||||||
#include "ARMMachineFunctionInfo.h"
|
#include "ARMMachineFunctionInfo.h"
|
||||||
#include "ARMRegisterInfo.h"
|
#include "ARMRegisterInfo.h"
|
||||||
#include "ARMGenInstrInfo.inc"
|
#include "ARMGenInstrInfo.inc"
|
||||||
|
@ -40,9 +41,58 @@ static cl::opt<bool>
|
||||||
EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
|
EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
|
||||||
cl::desc("Enable ARM 2-addr to 3-addr conv"));
|
cl::desc("Enable ARM 2-addr to 3-addr conv"));
|
||||||
|
|
||||||
|
|
||||||
|
/// ARM_MLxEntry - One row of the fp MLA / MLS description table: the fused
/// opcode plus the pair of opcodes it can be expanded into.
struct ARM_MLxEntry {
  /// The MLA / MLS opcode this row describes.
  unsigned MLxOpc;
  /// Multiplication opcode used when the instruction is expanded.
  unsigned MulOpc;
  /// Add / sub opcode used when the instruction is expanded.
  unsigned AddSubOpc;
  /// True if the acc is negated before the add / sub.
  bool NegAcc;
  /// True if the instruction has an extra "lane" operand.
  bool HasLane;
};
|
||||||
|
|
||||||
|
/// ARM_MLxTable - Every fp MLA / MLS opcode known to the target together with
/// its expansion. Rows are looked up by index, so the order must stay stable
/// for the lifetime of any map that stores indices into this table.
static const ARM_MLxEntry ARM_MLxTable[] = {
  // MLxOpc,          MulOpc,           AddSubOpc,       NegAcc, HasLane

  // Scalar floating-point forms.
  { ARM::VMLAS,       ARM::VMULS,       ARM::VADDS,      false,  false },
  { ARM::VMLSS,       ARM::VMULS,       ARM::VSUBS,      false,  false },
  { ARM::VMLAD,       ARM::VMULD,       ARM::VADDD,      false,  false },
  { ARM::VMLSD,       ARM::VMULD,       ARM::VSUBD,      false,  false },
  { ARM::VMLAfd_sfp,  ARM::VMULfd_sfp,  ARM::VADDfd_sfp, false,  false },
  { ARM::VMLSfd_sfp,  ARM::VMULfd_sfp,  ARM::VSUBfd_sfp, false,  false },
  { ARM::VNMLAS,      ARM::VNMULS,      ARM::VSUBS,      true,   false },
  { ARM::VNMLSS,      ARM::VMULS,       ARM::VSUBS,      true,   false },
  { ARM::VNMLAD,      ARM::VNMULD,      ARM::VSUBD,      true,   false },
  { ARM::VNMLSD,      ARM::VMULD,       ARM::VSUBD,      true,   false },

  // SIMD floating-point forms; the "sl" variants take a lane operand.
  { ARM::VMLAfd,      ARM::VMULfd,      ARM::VADDfd,     false,  false },
  { ARM::VMLSfd,      ARM::VMULfd,      ARM::VSUBfd,     false,  false },
  { ARM::VMLAfq,      ARM::VMULfq,      ARM::VADDfq,     false,  false },
  { ARM::VMLSfq,      ARM::VMULfq,      ARM::VSUBfq,     false,  false },
  { ARM::VMLAslfd,    ARM::VMULslfd,    ARM::VADDfd,     false,  true  },
  { ARM::VMLSslfd,    ARM::VMULslfd,    ARM::VSUBfd,     false,  true  },
  { ARM::VMLAslfq,    ARM::VMULslfq,    ARM::VADDfq,     false,  true  },
  { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
};
|
||||||
|
|
||||||
/// Construct the shared ARM instruction info and index the fp MLA / MLS table:
/// MLxEntryMap maps each MLx opcode to its row in ARM_MLxTable, and
/// MLxHazardOpcodes collects every add / sub / multiply opcode that appears as
/// an expansion in that table.
ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
  : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
    Subtarget(STI) {
  const unsigned NumEntries = array_lengthof(ARM_MLxTable);
  for (unsigned Idx = 0; Idx != NumEntries; ++Idx) {
    const ARM_MLxEntry &Entry = ARM_MLxTable[Idx];

    // Each MLx opcode may appear in the table only once.
    const bool Inserted =
      MLxEntryMap.insert(std::make_pair(Entry.MLxOpc, Idx)).second;
    if (!Inserted)
      assert(false && "Duplicated entries?");

    MLxHazardOpcodes.insert(Entry.AddSubOpc);
    MLxHazardOpcodes.insert(Entry.MulOpc);
  }
}
|
||||||
|
|
||||||
|
/// Create the post-RA hazard recognizer. Thumb2 subtargets and subtargets with
/// VFP2 get the ARM-specific recognizer; everything else falls back to the
/// default provided by TargetInstrInfoImpl.
ScheduleHazardRecognizer *ARMBaseInstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const {
  const bool WantsARMRecognizer = Subtarget.isThumb2() || Subtarget.hasVFP2();
  if (!WantsARMRecognizer)
    return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II);

  return (ScheduleHazardRecognizer *)
    new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget);
}
|
||||||
|
|
||||||
MachineInstr *
|
MachineInstr *
|
||||||
|
@ -197,7 +247,6 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
|
||||||
return NewMIs[0];
|
return NewMIs[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Branch analysis.
|
// Branch analysis.
|
||||||
bool
|
bool
|
||||||
ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
|
ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
|
||||||
|
@ -2196,3 +2245,19 @@ hasLowDefLatency(const InstrItineraryData *ItinData,
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
|
||||||
|
unsigned &AddSubOpc,
|
||||||
|
bool &NegAcc, bool &HasLane) const {
|
||||||
|
DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
|
||||||
|
if (I == MLxEntryMap.end())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
|
||||||
|
MulOpc = Entry.MulOpc;
|
||||||
|
AddSubOpc = Entry.AddSubOpc;
|
||||||
|
NegAcc = Entry.NegAcc;
|
||||||
|
HasLane = Entry.HasLane;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
#include "ARM.h"
|
#include "ARM.h"
|
||||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||||
#include "llvm/Target/TargetInstrInfo.h"
|
#include "llvm/Target/TargetInstrInfo.h"
|
||||||
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/SmallSet.h"
|
||||||
|
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
class ARMSubtarget;
|
class ARMSubtarget;
|
||||||
|
@ -191,9 +193,11 @@ namespace ARMII {
|
||||||
|
|
||||||
class ARMBaseInstrInfo : public TargetInstrInfoImpl {
|
class ARMBaseInstrInfo : public TargetInstrInfoImpl {
|
||||||
const ARMSubtarget &Subtarget;
|
const ARMSubtarget &Subtarget;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Can be only subclassed.
|
// Can be only subclassed.
|
||||||
explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
|
explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Return the non-pre/post incrementing version of 'Opc'. Return 0
|
// Return the non-pre/post incrementing version of 'Opc'. Return 0
|
||||||
// if there is not such an opcode.
|
// if there is not such an opcode.
|
||||||
|
@ -206,7 +210,9 @@ public:
|
||||||
virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
|
virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
|
||||||
const ARMSubtarget &getSubtarget() const { return Subtarget; }
|
const ARMSubtarget &getSubtarget() const { return Subtarget; }
|
||||||
|
|
||||||
public:
|
ScheduleHazardRecognizer *
|
||||||
|
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
|
||||||
|
|
||||||
// Branch analysis.
|
// Branch analysis.
|
||||||
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
|
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
|
||||||
MachineBasicBlock *&FBB,
|
MachineBasicBlock *&FBB,
|
||||||
|
@ -393,6 +399,38 @@ private:
|
||||||
const MachineInstr *UseMI, unsigned UseIdx) const;
|
const MachineInstr *UseMI, unsigned UseIdx) const;
|
||||||
bool hasLowDefLatency(const InstrItineraryData *ItinData,
|
bool hasLowDefLatency(const InstrItineraryData *ItinData,
|
||||||
const MachineInstr *DefMI, unsigned DefIdx) const;
|
const MachineInstr *DefMI, unsigned DefIdx) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// Modeling special VFP / NEON fp MLA / MLS hazards.
|
||||||
|
|
||||||
|
/// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
|
||||||
|
/// MLx table.
|
||||||
|
DenseMap<unsigned, unsigned> MLxEntryMap;
|
||||||
|
|
||||||
|
/// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
|
||||||
|
/// stalls when scheduled together with fp MLA / MLS opcodes.
|
||||||
|
SmallSet<unsigned, 16> MLxHazardOpcodes;
|
||||||
|
|
||||||
|
public:
|
||||||
|
/// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
|
||||||
|
/// instruction.
|
||||||
|
bool isFpMLxInstruction(unsigned Opcode) const {
|
||||||
|
return MLxEntryMap.count(Opcode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// isFpMLxInstruction - This version also returns the multiply opcode and the
|
||||||
|
/// addition / subtraction opcode to expand to. Return true for 'HasLane' for
|
||||||
|
/// the MLX instructions with an extra lane operand.
|
||||||
|
bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
|
||||||
|
unsigned &AddSubOpc, bool &NegAcc,
|
||||||
|
bool &HasLane) const;
|
||||||
|
|
||||||
|
/// canCauseFpMLxStall - Return true if an instruction of the specified opcode
|
||||||
|
/// will cause stalls when scheduled after (within 4-cycle window) a fp
|
||||||
|
/// MLA / MLS instruction.
|
||||||
|
bool canCauseFpMLxStall(unsigned Opcode) const {
|
||||||
|
return MLxHazardOpcodes.count(Opcode);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "ARMHazardRecognizer.h"
|
||||||
|
#include "ARMBaseInstrInfo.h"
|
||||||
|
#include "ARMSubtarget.h"
|
||||||
|
#include "llvm/CodeGen/MachineInstr.h"
|
||||||
|
#include "llvm/CodeGen/ScheduleDAG.h"
|
||||||
|
#include "llvm/Target/TargetRegisterInfo.h"
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
/// Return true if \p MI is a VFP / NEON instruction that reads the register
/// defined by operand 0 of \p DefMI. VFP stores and VFP-to-core moves, NEON
/// loads / stores, and all instructions outside the VFP / NEON domains are
/// never reported as hazards.
static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
                         const TargetRegisterInfo &TRI) {
  // FIXME: Detect integer instructions properly.
  const TargetInstrDesc &Desc = MI->getDesc();
  const unsigned Domain = Desc.TSFlags & ARMII::DomainMask;
  const bool IsVFP = Domain == ARMII::DomainVFP;
  const bool IsNEON = Domain == ARMII::DomainNEON;

  if (!IsVFP && !IsNEON)
    return false;

  if (IsVFP) {
    switch (MI->getOpcode()) {
    case ARM::VSTRS:
    case ARM::VSTRD:
    case ARM::VMOVRS:
    case ARM::VMOVRRD:
      return false;
    default:
      break;
    }
  } else if (Desc.mayStore() || Desc.mayLoad()) {
    return false;
  }

  const unsigned DefReg = DefMI->getOperand(0).getReg();
  return MI->readsRegister(DefReg, &TRI);
}
|
||||||
|
|
||||||
|
/// Decide whether issuing \p SU now would hit an ARM-specific hazard.
/// Reports Hazard when an IT block is pending and \p SU is not its next
/// instruction, or when \p SU would stall behind a recent fp VMLA / VMLS.
/// Otherwise the decision is delegated to PostRAHazardRecognizer.
ScheduleHazardRecognizer::HazardType
ARMHazardRecognizer::getHazardType(SUnit *SU) {
  MachineInstr *Candidate = SU->getInstr();

  // Debug values skip all ARM-specific checks.
  if (Candidate->isDebugValue())
    return PostRAHazardRecognizer::getHazardType(SU);

  // While an IT block is open, only its next instruction may be issued.
  if (ITBlockSize && Candidate != ITBlockMIs[ITBlockSize-1])
    return Hazard;

  // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
  // a VMLA / VMLS will cause 4 cycle stall.
  const bool InFpDomain =
    (Candidate->getDesc().TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral;
  if (LastMI && InFpDomain) {
    MachineInstr *Producer = LastMI;
    const TargetInstrDesc &PrevDesc = LastMI->getDesc();

    // Skip over one non-VFP / NEON instruction.
    const bool PrevIsPlainGeneral =
      !PrevDesc.isBarrier() &&
      (PrevDesc.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral;
    if (PrevIsPlainGeneral) {
      MachineBasicBlock::iterator It = LastMI;
      if (It != LastMI->getParent()->begin())
        Producer = &*llvm::prior(It);
    }

    const bool StallsBehindMLx =
      TII.isFpMLxInstruction(Producer->getOpcode()) &&
      (TII.canCauseFpMLxStall(Candidate->getOpcode()) ||
       hasRAWHazard(Producer, Candidate, TRI));
    if (StallsBehindMLx) {
      // Try to schedule another instruction for the next 4 cycles.
      if (Stalls == 0)
        Stalls = 4;
      return Hazard;
    }
  }

  return PostRAHazardRecognizer::getHazardType(SU);
}
|
||||||
|
|
||||||
|
/// Forget all ARM-specific scheduling state (pending IT block, VMLx stall
/// countdown, last emitted instruction), then reset the base recognizer.
void ARMHazardRecognizer::Reset() {
  ITBlockSize = 0;
  Stalls = 0;
  LastMI = 0;
  PostRAHazardRecognizer::Reset();
}
|
||||||
|
|
||||||
|
/// Record that \p SU has been issued. Tracks progress through an open IT
/// block, opens a new one when a t2IT is issued, and remembers the last
/// non-debug instruction for the VMLx hazard check.
void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
  MachineInstr *Issued = SU->getInstr();
  const unsigned Opc = Issued->getOpcode();

  if (ITBlockSize) {
    // One more instruction of the open IT block has been scheduled.
    --ITBlockSize;
  } else if (Opc == ARM::t2IT) {
    // The number of trailing zeros in the mask determines the block length.
    const unsigned Mask = Issued->getOperand(1).getImm();
    const unsigned NumTZ = CountTrailingZeros_32(Mask);
    assert(NumTZ <= 3 && "Invalid IT mask!");
    ITBlockSize = 4 - NumTZ;

    // Store the block's instructions in reverse, so the next one to issue is
    // always ITBlockMIs[ITBlockSize-1].
    MachineBasicBlock::iterator Next = Issued;
    for (unsigned Slot = ITBlockSize; Slot != 0; --Slot) {
      // Advance to the next instruction, skipping any dbg_value instructions.
      do
        ++Next;
      while (Next->isDebugValue());
      ITBlockMIs[Slot-1] = &*Next;
    }
  }

  if (!Issued->isDebugValue()) {
    Stalls = 0;
    LastMI = Issued;
  }

  PostRAHazardRecognizer::EmitInstruction(SU);
}
|
||||||
|
|
||||||
|
void ARMHazardRecognizer::AdvanceCycle() {
|
||||||
|
if (Stalls && --Stalls == 0)
|
||||||
|
// Stalled for 4 cycles but still can't schedule any other instructions.
|
||||||
|
LastMI = 0;
|
||||||
|
PostRAHazardRecognizer::AdvanceCycle();
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This file defines hazard recognizers for scheduling ARM functions.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef ARMHAZARDRECOGNIZER_H
|
||||||
|
#define ARMHAZARDRECOGNIZER_H
|
||||||
|
|
||||||
|
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
|
||||||
|
class ARMBaseInstrInfo;
|
||||||
|
class ARMBaseRegisterInfo;
|
||||||
|
class ARMSubtarget;
|
||||||
|
class MachineInstr;
|
||||||
|
|
||||||
|
class ARMHazardRecognizer : public PostRAHazardRecognizer {
|
||||||
|
const ARMBaseInstrInfo &TII;
|
||||||
|
const ARMBaseRegisterInfo &TRI;
|
||||||
|
const ARMSubtarget &STI;
|
||||||
|
|
||||||
|
MachineInstr *LastMI;
|
||||||
|
unsigned Stalls;
|
||||||
|
unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled.
|
||||||
|
MachineInstr *ITBlockMIs[4];
|
||||||
|
|
||||||
|
public:
|
||||||
|
ARMHazardRecognizer(const InstrItineraryData *ItinData,
|
||||||
|
const ARMBaseInstrInfo &tii,
|
||||||
|
const ARMBaseRegisterInfo &tri,
|
||||||
|
const ARMSubtarget &sti) :
|
||||||
|
PostRAHazardRecognizer(ItinData), TII(tii), TRI(tri), STI(sti),
|
||||||
|
LastMI(0), ITBlockSize(0) {}
|
||||||
|
|
||||||
|
virtual HazardType getHazardType(SUnit *SU);
|
||||||
|
virtual void Reset();
|
||||||
|
virtual void EmitInstruction(SUnit *SU);
|
||||||
|
virtual void AdvanceCycle();
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
} // end namespace llvm
|
||||||
|
|
||||||
|
#endif // ARMHAZARDRECOGNIZER_H
|
|
@ -13,6 +13,7 @@
|
||||||
|
|
||||||
#define DEBUG_TYPE "arm-isel"
|
#define DEBUG_TYPE "arm-isel"
|
||||||
#include "ARM.h"
|
#include "ARM.h"
|
||||||
|
#include "ARMBaseInstrInfo.h"
|
||||||
#include "ARMAddressingModes.h"
|
#include "ARMAddressingModes.h"
|
||||||
#include "ARMTargetMachine.h"
|
#include "ARMTargetMachine.h"
|
||||||
#include "llvm/CallingConv.h"
|
#include "llvm/CallingConv.h"
|
||||||
|
@ -41,6 +42,11 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
|
||||||
cl::desc("Disable isel of shifter-op"),
|
cl::desc("Disable isel of shifter-op"),
|
||||||
cl::init(false));
|
cl::init(false));
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
|
||||||
|
cl::desc("Check fp vmla / vmls hazard at isel time"),
|
||||||
|
cl::init(false));
|
||||||
|
|
||||||
//===--------------------------------------------------------------------===//
|
//===--------------------------------------------------------------------===//
|
||||||
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
|
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
|
||||||
/// instructions for SelectionDAG operations.
|
/// instructions for SelectionDAG operations.
|
||||||
|
@ -54,6 +60,7 @@ enum AddrMode2Type {
|
||||||
|
|
||||||
class ARMDAGToDAGISel : public SelectionDAGISel {
|
class ARMDAGToDAGISel : public SelectionDAGISel {
|
||||||
ARMBaseTargetMachine &TM;
|
ARMBaseTargetMachine &TM;
|
||||||
|
const ARMBaseInstrInfo *TII;
|
||||||
|
|
||||||
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
|
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
|
||||||
/// make the right decision when generating code for different targets.
|
/// make the right decision when generating code for different targets.
|
||||||
|
@ -63,6 +70,7 @@ public:
|
||||||
explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
|
explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
|
||||||
CodeGenOpt::Level OptLevel)
|
CodeGenOpt::Level OptLevel)
|
||||||
: SelectionDAGISel(tm, OptLevel), TM(tm),
|
: SelectionDAGISel(tm, OptLevel), TM(tm),
|
||||||
|
TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
|
||||||
Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
|
Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -78,6 +86,8 @@ public:
|
||||||
|
|
||||||
SDNode *Select(SDNode *N);
|
SDNode *Select(SDNode *N);
|
||||||
|
|
||||||
|
|
||||||
|
bool hasNoVMLxHazardUse(SDNode *N) const;
|
||||||
bool isShifterOpProfitable(const SDValue &Shift,
|
bool isShifterOpProfitable(const SDValue &Shift,
|
||||||
ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
|
ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
|
||||||
bool SelectShifterOperandReg(SDValue N, SDValue &A,
|
bool SelectShifterOperandReg(SDValue N, SDValue &A,
|
||||||
|
@ -272,6 +282,50 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
|
||||||
isInt32Immediate(N->getOperand(1).getNode(), Imm);
|
isInt32Immediate(N->getOperand(1).getNode(), Imm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
|
||||||
|
/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
|
||||||
|
/// least on current ARM implementations) which should be avoidded.
|
||||||
|
bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
|
||||||
|
if (OptLevel == CodeGenOpt::None)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!CheckVMLxHazard)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!N->hasOneUse())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
SDNode *Use = *N->use_begin();
|
||||||
|
if (Use->getOpcode() == ISD::CopyToReg)
|
||||||
|
return true;
|
||||||
|
if (Use->isMachineOpcode()) {
|
||||||
|
const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode());
|
||||||
|
if (TID.mayStore())
|
||||||
|
return true;
|
||||||
|
unsigned Opcode = TID.getOpcode();
|
||||||
|
if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
|
||||||
|
return true;
|
||||||
|
// vmlx feeding into another vmlx. We actually want to unfold
|
||||||
|
// the use later in the MLxExpansion pass. e.g.
|
||||||
|
// vmla
|
||||||
|
// vmla (stall 8 cycles)
|
||||||
|
//
|
||||||
|
// vmul (5 cycles)
|
||||||
|
// vadd (5 cycles)
|
||||||
|
// vmla
|
||||||
|
// This adds up to about 18 - 19 cycles.
|
||||||
|
//
|
||||||
|
// vmla
|
||||||
|
// vmul (stall 4 cycles)
|
||||||
|
// vadd adds up to about 14 cycles.
|
||||||
|
return TII->isFpMLxInstruction(Opcode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
|
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
|
||||||
ARM_AM::ShiftOpc ShOpcVal,
|
ARM_AM::ShiftOpc ShOpcVal,
|
||||||
|
|
|
@ -175,7 +175,7 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
|
||||||
// FIXME: Eventually this will be just "hasV6T2Ops".
|
// FIXME: Eventually this will be just "hasV6T2Ops".
|
||||||
def UseMovt : Predicate<"Subtarget->useMovt()">;
|
def UseMovt : Predicate<"Subtarget->useMovt()">;
|
||||||
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
|
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
|
||||||
def UseVMLx : Predicate<"Subtarget->useVMLx()">;
|
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// ARM Flag Definitions.
|
// ARM Flag Definitions.
|
||||||
|
@ -279,6 +279,21 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
|
||||||
return N->hasOneUse();
|
return N->hasOneUse();
|
||||||
}]>;
|
}]>;
|
||||||
|
|
||||||
|
// An 'fmul' node with a single use, so that folding it into a multiply-accumulate does not leave a separately computed fmul behind.
|
||||||
|
def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
|
||||||
|
return N->hasOneUse();
|
||||||
|
}]>;
|
||||||
|
|
||||||
|
// An 'fadd' node that matches only when hasNoVMLxHazardUse(N) accepts it, i.e. it has a single non-hazardous use.
|
||||||
|
def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
|
||||||
|
return hasNoVMLxHazardUse(N);
|
||||||
|
}]>;
|
||||||
|
|
||||||
|
// An 'fsub' node that matches only when hasNoVMLxHazardUse(N) accepts it, i.e. it has a single non-hazardous use.
|
||||||
|
def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
|
||||||
|
return hasNoVMLxHazardUse(N);
|
||||||
|
}]>;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// Operand Definitions.
|
// Operand Definitions.
|
||||||
//
|
//
|
||||||
|
|
|
@ -1907,7 +1907,7 @@ class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
// Multiply-Add/Sub operations: single-, double- and quad-register.
|
// Multiply-Add/Sub operations: single-, double- and quad-register.
|
||||||
class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
InstrItinClass itin, string OpcodeStr, string Dt,
|
InstrItinClass itin, string OpcodeStr, string Dt,
|
||||||
ValueType Ty, SDNode MulOp, SDNode OpNode>
|
ValueType Ty, SDPatternOperator MulOp, SDNode OpNode>
|
||||||
: N3V<op24, op23, op21_20, op11_8, 0, op4,
|
: N3V<op24, op23, op21_20, op11_8, 0, op4,
|
||||||
(outs DPR_VFP2:$Vd),
|
(outs DPR_VFP2:$Vd),
|
||||||
(ins DPR_VFP2:$src1, DPR_VFP2:$Vn, DPR_VFP2:$Vm), N3RegFrm, itin,
|
(ins DPR_VFP2:$src1, DPR_VFP2:$Vn, DPR_VFP2:$Vm), N3RegFrm, itin,
|
||||||
|
@ -1915,7 +1915,7 @@ class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
|
|
||||||
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
InstrItinClass itin, string OpcodeStr, string Dt,
|
InstrItinClass itin, string OpcodeStr, string Dt,
|
||||||
ValueType Ty, SDNode MulOp, SDNode OpNode>
|
ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
|
||||||
: N3V<op24, op23, op21_20, op11_8, 0, op4,
|
: N3V<op24, op23, op21_20, op11_8, 0, op4,
|
||||||
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
|
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
|
||||||
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
|
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
|
||||||
|
@ -1924,7 +1924,7 @@ class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
|
|
||||||
class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
|
class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
|
||||||
string OpcodeStr, string Dt,
|
string OpcodeStr, string Dt,
|
||||||
ValueType Ty, SDNode MulOp, SDNode ShOp>
|
ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
|
||||||
: N3V<0, 1, op21_20, op11_8, 1, 0,
|
: N3V<0, 1, op21_20, op11_8, 1, 0,
|
||||||
(outs DPR:$Vd),
|
(outs DPR:$Vd),
|
||||||
(ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
|
(ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
|
||||||
|
@ -1951,7 +1951,7 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
|
||||||
|
|
||||||
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
|
InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
|
||||||
SDNode MulOp, SDNode OpNode>
|
SDPatternOperator MulOp, SDPatternOperator OpNode>
|
||||||
: N3V<op24, op23, op21_20, op11_8, 1, op4,
|
: N3V<op24, op23, op21_20, op11_8, 1, op4,
|
||||||
(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
|
(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
|
||||||
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
|
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
|
||||||
|
@ -1959,7 +1959,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
|
||||||
(Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
|
(Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
|
||||||
class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
|
class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
|
||||||
string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
|
string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
|
||||||
SDNode MulOp, SDNode ShOp>
|
SDPatternOperator MulOp, SDPatternOperator ShOp>
|
||||||
: N3V<1, 1, op21_20, op11_8, 1, 0,
|
: N3V<1, 1, op21_20, op11_8, 1, 0,
|
||||||
(outs QPR:$Vd),
|
(outs QPR:$Vd),
|
||||||
(ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
|
(ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
|
||||||
|
@ -3282,15 +3282,19 @@ defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
|
||||||
defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
|
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
|
||||||
def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
|
def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
|
||||||
v2f32, fmul, fadd>;
|
v2f32, fmul_su, fadd_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
|
def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
|
||||||
v4f32, fmul, fadd>;
|
v4f32, fmul_su, fadd_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
|
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
|
||||||
def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
|
def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
|
||||||
v2f32, fmul, fadd>;
|
v2f32, fmul_su, fadd_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
|
def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
|
||||||
v4f32, v2f32, fmul, fadd>;
|
v4f32, v2f32, fmul_su, fadd_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
|
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
|
||||||
(mul (v8i16 QPR:$src2),
|
(mul (v8i16 QPR:$src2),
|
||||||
|
@ -3308,14 +3312,15 @@ def : Pat<(v4i32 (add (v4i32 QPR:$src1),
|
||||||
(DSubReg_i32_reg imm:$lane))),
|
(DSubReg_i32_reg imm:$lane))),
|
||||||
(SubReg_i32_lane imm:$lane)))>;
|
(SubReg_i32_lane imm:$lane)))>;
|
||||||
|
|
||||||
def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
|
def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
|
||||||
(fmul (v4f32 QPR:$src2),
|
(fmul_su (v4f32 QPR:$src2),
|
||||||
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
|
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
|
||||||
(v4f32 (VMLAslfq (v4f32 QPR:$src1),
|
(v4f32 (VMLAslfq (v4f32 QPR:$src1),
|
||||||
(v4f32 QPR:$src2),
|
(v4f32 QPR:$src2),
|
||||||
(v2f32 (EXTRACT_SUBREG QPR:$src3,
|
(v2f32 (EXTRACT_SUBREG QPR:$src3,
|
||||||
(DSubReg_i32_reg imm:$lane))),
|
(DSubReg_i32_reg imm:$lane))),
|
||||||
(SubReg_i32_lane imm:$lane)))>;
|
(SubReg_i32_lane imm:$lane)))>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
|
|
||||||
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
|
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
|
||||||
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
|
@ -3335,15 +3340,19 @@ defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
|
||||||
defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
|
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
|
||||||
def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
|
def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
|
||||||
v2f32, fmul, fsub>;
|
v2f32, fmul_su, fsub_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
|
def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
|
||||||
v4f32, fmul, fsub>;
|
v4f32, fmul_su, fsub_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
|
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
|
||||||
def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
|
def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
|
||||||
v2f32, fmul, fsub>;
|
v2f32, fmul_su, fsub_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
|
def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
|
||||||
v4f32, v2f32, fmul, fsub>;
|
v4f32, v2f32, fmul_su, fsub_mlx>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
|
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
|
||||||
(mul (v8i16 QPR:$src2),
|
(mul (v8i16 QPR:$src2),
|
||||||
|
@ -3361,13 +3370,14 @@ def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
|
||||||
(DSubReg_i32_reg imm:$lane))),
|
(DSubReg_i32_reg imm:$lane))),
|
||||||
(SubReg_i32_lane imm:$lane)))>;
|
(SubReg_i32_lane imm:$lane)))>;
|
||||||
|
|
||||||
def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
|
def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
|
||||||
(fmul (v4f32 QPR:$src2),
|
(fmul_su (v4f32 QPR:$src2),
|
||||||
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
|
(v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
|
||||||
(v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
|
(v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
|
||||||
(v2f32 (EXTRACT_SUBREG QPR:$src3,
|
(v2f32 (EXTRACT_SUBREG QPR:$src3,
|
||||||
(DSubReg_i32_reg imm:$lane))),
|
(DSubReg_i32_reg imm:$lane))),
|
||||||
(SubReg_i32_lane imm:$lane)))>;
|
(SubReg_i32_lane imm:$lane)))>,
|
||||||
|
Requires<[HasNEON, UseFPVMLx]>;
|
||||||
|
|
||||||
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
|
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
|
||||||
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
|
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
|
||||||
|
@ -4706,15 +4716,17 @@ def : N3VSPat<fmul, VMULfd_sfp>;
|
||||||
// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so
|
// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so
|
||||||
// we want to avoid them for now. e.g., alternating vmla/vadd instructions.
|
// we want to avoid them for now. e.g., alternating vmla/vadd instructions.
|
||||||
|
|
||||||
//let neverHasSideEffects = 1 in
|
let neverHasSideEffects = 1 in
|
||||||
//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32",
|
def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32",
|
||||||
// v2f32, fmul, fadd>;
|
v2f32, fmul_su, fadd>;
|
||||||
//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>;
|
def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>,
|
||||||
|
Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
|
||||||
|
|
||||||
//let neverHasSideEffects = 1 in
|
let neverHasSideEffects = 1 in
|
||||||
//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32",
|
def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32",
|
||||||
// v2f32, fmul, fsub>;
|
v2f32, fmul_su, fsub>;
|
||||||
//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>;
|
def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>,
|
||||||
|
Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
|
||||||
|
|
||||||
// Vector Absolute used for single-precision FP
|
// Vector Absolute used for single-precision FP
|
||||||
let neverHasSideEffects = 1 in
|
let neverHasSideEffects = 1 in
|
||||||
|
|
|
@ -751,93 +751,93 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1,
|
||||||
def VMLAD : ADbI<0b11100, 0b00, 0, 0,
|
def VMLAD : ADbI<0b11100, 0b00, 0, 0,
|
||||||
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
||||||
IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
|
IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
|
||||||
[(set DPR:$Dd, (fadd (fmul DPR:$Dn, DPR:$Dm),
|
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
|
||||||
(f64 DPR:$Ddin)))]>,
|
(f64 DPR:$Ddin)))]>,
|
||||||
RegConstraint<"$Ddin = $Dd">,
|
RegConstraint<"$Ddin = $Dd">,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
|
|
||||||
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
|
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
|
||||||
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
||||||
IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
|
IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
|
||||||
[(set SPR:$Sd, (fadd (fmul SPR:$Sn, SPR:$Sm),
|
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
|
||||||
SPR:$Sdin))]>,
|
SPR:$Sdin))]>,
|
||||||
RegConstraint<"$Sdin = $Sd">,
|
RegConstraint<"$Sdin = $Sd">,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(fadd DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))),
|
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
|
||||||
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
|
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
def : Pat<(fadd SPR:$dstin, (fmul SPR:$a, SPR:$b)),
|
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
|
||||||
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
|
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP, UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
|
||||||
|
|
||||||
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
|
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
|
||||||
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
||||||
IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
|
IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
|
||||||
[(set DPR:$Dd, (fadd (fneg (fmul DPR:$Dn,DPR:$Dm)),
|
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
|
||||||
(f64 DPR:$Ddin)))]>,
|
(f64 DPR:$Ddin)))]>,
|
||||||
RegConstraint<"$Ddin = $Dd">,
|
RegConstraint<"$Ddin = $Dd">,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
|
|
||||||
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
|
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
|
||||||
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
||||||
IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
|
IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
|
||||||
[(set SPR:$Sd, (fadd (fneg (fmul SPR:$Sn, SPR:$Sm)),
|
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
|
||||||
SPR:$Sdin))]>,
|
SPR:$Sdin))]>,
|
||||||
RegConstraint<"$Sdin = $Sd">,
|
RegConstraint<"$Sdin = $Sd">,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))),
|
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
|
||||||
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
|
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
|
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
|
||||||
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
|
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
|
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
|
||||||
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
||||||
IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
|
IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
|
||||||
[(set DPR:$Dd,(fsub (fneg (fmul DPR:$Dn,DPR:$Dm)),
|
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
|
||||||
(f64 DPR:$Ddin)))]>,
|
(f64 DPR:$Ddin)))]>,
|
||||||
RegConstraint<"$Ddin = $Dd">,
|
RegConstraint<"$Ddin = $Dd">,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
|
|
||||||
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
|
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
|
||||||
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
||||||
IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
|
IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
|
||||||
[(set SPR:$Sd, (fsub (fneg (fmul SPR:$Sn, SPR:$Sm)),
|
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
|
||||||
SPR:$Sdin))]>,
|
SPR:$Sdin))]>,
|
||||||
RegConstraint<"$Sdin = $Sd">,
|
RegConstraint<"$Sdin = $Sd">,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(fsub (fneg (fmul DPR:$a, (f64 DPR:$b))), DPR:$dstin),
|
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
|
||||||
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
|
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
def : Pat<(fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin),
|
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
|
||||||
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
|
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
|
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
|
||||||
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
|
||||||
IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
|
IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
|
||||||
[(set DPR:$Dd, (fsub (fmul DPR:$Dn, DPR:$Dm),
|
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
|
||||||
(f64 DPR:$Ddin)))]>,
|
(f64 DPR:$Ddin)))]>,
|
||||||
RegConstraint<"$Ddin = $Dd">,
|
RegConstraint<"$Ddin = $Dd">,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
|
|
||||||
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
|
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
|
||||||
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
|
||||||
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
|
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
|
||||||
[(set SPR:$Sd, (fsub (fmul SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
|
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
|
||||||
RegConstraint<"$Sdin = $Sd">,
|
RegConstraint<"$Sdin = $Sd">,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
def : Pat<(fsub (fmul DPR:$a, (f64 DPR:$b)), DPR:$dstin),
|
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
|
||||||
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
|
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
|
||||||
Requires<[HasVFP2,UseVMLx]>;
|
Requires<[HasVFP2,UseFPVMLx]>;
|
||||||
def : Pat<(fsub (fmul SPR:$a, SPR:$b), SPR:$dstin),
|
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
|
||||||
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
|
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
|
||||||
Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
|
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
|
||||||
|
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -37,7 +37,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
|
||||||
, ARMProcFamily(Others)
|
, ARMProcFamily(Others)
|
||||||
, ARMFPUType(None)
|
, ARMFPUType(None)
|
||||||
, UseNEONForSinglePrecisionFP(false)
|
, UseNEONForSinglePrecisionFP(false)
|
||||||
, SlowVMLx(false)
|
, SlowFPVMLx(false)
|
||||||
, SlowFPBrcc(false)
|
, SlowFPBrcc(false)
|
||||||
, IsThumb(isT)
|
, IsThumb(isT)
|
||||||
, ThumbMode(Thumb1)
|
, ThumbMode(Thumb1)
|
||||||
|
|
|
@ -57,9 +57,9 @@ protected:
|
||||||
/// determine if NEON should actually be used.
|
/// determine if NEON should actually be used.
|
||||||
bool UseNEONForSinglePrecisionFP;
|
bool UseNEONForSinglePrecisionFP;
|
||||||
|
|
||||||
/// SlowVMLx - If the VFP2 instructions are available, indicates whether
|
/// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
|
||||||
/// the VML[AS] instructions are slow (if so, don't use them).
|
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
|
||||||
bool SlowVMLx;
|
bool SlowFPVMLx;
|
||||||
|
|
||||||
/// SlowFPBrcc - True if floating point compare + branch is slow.
|
/// SlowFPBrcc - True if floating point compare + branch is slow.
|
||||||
bool SlowFPBrcc;
|
bool SlowFPBrcc;
|
||||||
|
@ -176,7 +176,7 @@ protected:
|
||||||
bool hasDivide() const { return HasHardwareDivide; }
|
bool hasDivide() const { return HasHardwareDivide; }
|
||||||
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
|
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
|
||||||
bool hasDataBarrier() const { return HasDataBarrier; }
|
bool hasDataBarrier() const { return HasDataBarrier; }
|
||||||
bool useVMLx() const {return hasVFP2() && !SlowVMLx; }
|
bool useFPVMLx() const { return !SlowFPVMLx; }
|
||||||
bool isFPBrccSlow() const { return SlowFPBrcc; }
|
bool isFPBrccSlow() const { return SlowFPBrcc; }
|
||||||
bool isFPOnlySP() const { return FPOnlySP; }
|
bool isFPOnlySP() const { return FPOnlySP; }
|
||||||
bool prefers32BitThumb() const { return Pref32BitThumb; }
|
bool prefers32BitThumb() const { return Pref32BitThumb; }
|
||||||
|
|
|
@ -16,11 +16,14 @@
|
||||||
#include "ARM.h"
|
#include "ARM.h"
|
||||||
#include "llvm/PassManager.h"
|
#include "llvm/PassManager.h"
|
||||||
#include "llvm/CodeGen/Passes.h"
|
#include "llvm/CodeGen/Passes.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
#include "llvm/Support/FormattedStream.h"
|
#include "llvm/Support/FormattedStream.h"
|
||||||
#include "llvm/Target/TargetOptions.h"
|
#include "llvm/Target/TargetOptions.h"
|
||||||
#include "llvm/Target/TargetRegistry.h"
|
#include "llvm/Target/TargetRegistry.h"
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
|
static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
|
||||||
|
|
||||||
static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
|
static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
|
||||||
Triple TheTriple(TT);
|
Triple TheTriple(TT);
|
||||||
switch (TheTriple.getOS()) {
|
switch (TheTriple.getOS()) {
|
||||||
|
@ -146,6 +149,9 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
|
||||||
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
|
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
|
||||||
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
|
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
|
||||||
PM.add(createARMLoadStoreOptimizationPass(true));
|
PM.add(createARMLoadStoreOptimizationPass(true));
|
||||||
|
if (ExpandMLx &&
|
||||||
|
OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
|
||||||
|
PM.add(createMLxExpansionPass());
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,7 @@ add_llvm_target(ARMCodeGen
|
||||||
ARMFastISel.cpp
|
ARMFastISel.cpp
|
||||||
ARMFrameInfo.cpp
|
ARMFrameInfo.cpp
|
||||||
ARMGlobalMerge.cpp
|
ARMGlobalMerge.cpp
|
||||||
|
ARMHazardRecognizer.cpp
|
||||||
ARMISelDAGToDAG.cpp
|
ARMISelDAGToDAG.cpp
|
||||||
ARMISelLowering.cpp
|
ARMISelLowering.cpp
|
||||||
ARMInstrInfo.cpp
|
ARMInstrInfo.cpp
|
||||||
|
@ -46,7 +47,6 @@ add_llvm_target(ARMCodeGen
|
||||||
Thumb1InstrInfo.cpp
|
Thumb1InstrInfo.cpp
|
||||||
Thumb1FrameInfo.cpp
|
Thumb1FrameInfo.cpp
|
||||||
Thumb1RegisterInfo.cpp
|
Thumb1RegisterInfo.cpp
|
||||||
Thumb2HazardRecognizer.cpp
|
|
||||||
Thumb2ITBlockPass.cpp
|
Thumb2ITBlockPass.cpp
|
||||||
Thumb2InstrInfo.cpp
|
Thumb2InstrInfo.cpp
|
||||||
Thumb2RegisterInfo.cpp
|
Thumb2RegisterInfo.cpp
|
||||||
|
|
|
@ -0,0 +1,324 @@
|
||||||
|
//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
|
||||||
|
// multiply and add / sub instructions) when special VMLx hazards are detected.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "mlx-expansion"
|
||||||
|
#include "ARM.h"
|
||||||
|
#include "ARMBaseInstrInfo.h"
|
||||||
|
#include "llvm/CodeGen/MachineInstr.h"
|
||||||
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||||
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||||
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||||
|
#include "llvm/Target/TargetRegisterInfo.h"
|
||||||
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/SmallSet.h"
|
||||||
|
#include "llvm/ADT/Statistic.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
#include "llvm/Support/raw_ostream.h"
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
|
||||||
|
static cl::opt<unsigned>
|
||||||
|
ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
|
||||||
|
|
||||||
|
STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
struct MLxExpansion : public MachineFunctionPass {
|
||||||
|
static char ID;
|
||||||
|
MLxExpansion() : MachineFunctionPass(ID) {}
|
||||||
|
|
||||||
|
virtual bool runOnMachineFunction(MachineFunction &Fn);
|
||||||
|
|
||||||
|
virtual const char *getPassName() const {
|
||||||
|
return "ARM MLA / MLS expansion pass";
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const ARMBaseInstrInfo *TII;
|
||||||
|
const TargetRegisterInfo *TRI;
|
||||||
|
MachineRegisterInfo *MRI;
|
||||||
|
|
||||||
|
unsigned HazardLimit;
|
||||||
|
unsigned MIIdx;
|
||||||
|
MachineInstr* LastMIs[4];
|
||||||
|
|
||||||
|
void clearStack();
|
||||||
|
void pushStack(MachineInstr *MI);
|
||||||
|
MachineInstr *getAccDefMI(MachineInstr *MI) const;
|
||||||
|
unsigned getDefReg(MachineInstr *MI) const;
|
||||||
|
bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
|
||||||
|
bool FindMLxHazard(MachineInstr *MI) const;
|
||||||
|
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
|
||||||
|
unsigned MulOpc, unsigned AddSubOpc,
|
||||||
|
bool NegAcc, bool HasLane);
|
||||||
|
bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
|
||||||
|
};
|
||||||
|
char MLxExpansion::ID = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// clearStack - Forget every recorded instruction and restart the circular
/// buffer at slot 0.
void MLxExpansion::clearStack() {
  for (unsigned Slot = 0; Slot != 4; ++Slot)
    LastMIs[Slot] = 0;
  MIIdx = 0;
}
|
||||||
|
|
||||||
|
/// pushStack - Record MI (which may be null, used as a placeholder) in the
/// current slot and advance to the next one, wrapping after the fourth slot.
void MLxExpansion::pushStack(MachineInstr *MI) {
  LastMIs[MIIdx] = MI;
  const unsigned NextSlot = MIIdx + 1;
  MIIdx = (NextSlot == 4) ? 0 : NextSlot;
}
|
||||||
|
|
||||||
|
/// getAccDefMI - Return the instruction that defines the accumulator operand
/// (operand 1) of MI, looking through copy-like and INSERT_SUBREG instructions
/// that live in MI's own basic block. This matters for the _sfp instructions.
/// Returns null when the accumulator is a physical register.
MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
  unsigned Reg = MI->getOperand(1).getReg();
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return 0;

  MachineBasicBlock *MBB = MI->getParent();
  MachineInstr *DefMI = MRI->getVRegDef(Reg);

  // Keep walking only while the definition stays inside MI's block.
  while (DefMI->getParent() == MBB) {
    // Pick the operand that carries the value being forwarded: operand 1 for
    // copy-like instructions, operand 2 for INSERT_SUBREG.
    unsigned SrcIdx;
    if (DefMI->isCopyLike())
      SrcIdx = 1;
    else if (DefMI->isInsertSubreg())
      SrcIdx = 2;
    else
      break;

    Reg = DefMI->getOperand(SrcIdx).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      break;
    DefMI = MRI->getVRegDef(Reg);
  }

  return DefMI;
}
|
||||||
|
|
||||||
|
/// getDefReg - Return the register that ultimately carries MI's result
/// (operand 0), following a chain of single-use COPY / INSERT_SUBREG users
/// inside MI's basic block. The walk stops at the first register that is
/// physical, does not have exactly one non-debug use, or whose user is in a
/// different block or is not a COPY / INSERT_SUBREG.
unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
  MachineBasicBlock *MBB = MI->getParent();
  unsigned Reg = MI->getOperand(0).getReg();

  for (;;) {
    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
        !MRI->hasOneNonDBGUse(Reg))
      return Reg;

    MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
    if (UseMI->getParent() != MBB)
      return Reg;

    if (!UseMI->isCopy() && !UseMI->isInsertSubreg())
      return Reg;

    // The sole user just forwards the value; continue with its result.
    Reg = UseMI->getOperand(0).getReg();
  }
}
|
||||||
|
|
||||||
|
bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
|
||||||
|
const TargetInstrDesc &TID = MI->getDesc();
|
||||||
|
// FIXME: Detect integer instructions properly.
|
||||||
|
unsigned Domain = TID.TSFlags & ARMII::DomainMask;
|
||||||
|
if (Domain == ARMII::DomainVFP) {
|
||||||
|
unsigned Opcode = TID.getOpcode();
|
||||||
|
if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
|
||||||
|
Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
|
||||||
|
return false;
|
||||||
|
} else if (Domain == ARMII::DomainNEON) {
|
||||||
|
if (TID.mayStore() || TID.mayLoad())
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return MI->readsRegister(Reg, TRI);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// FindMLxHazard - Return true if the fp MLA / MLS instruction MI should be
/// expanded into a multiply + add / sub pair.
///
/// Nothing is reported once NumExpand has reached ExpandLimit; otherwise
/// ForceExapnd forces a positive answer. Failing that, a hazard is reported
/// when the accumulator is itself produced by an fp MLx instruction, or when
/// one of the recorded instructions can cause an fp MLx stall or reads the
/// register that carries MI's result.
bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
  if (NumExpand >= ExpandLimit)
    return false;

  if (ForceExapnd)
    return true;

  // getAccDefMI returns null when the accumulator is a physical register, so
  // the definition must be checked before it is dereferenced.
  MachineInstr *DefMI = getAccDefMI(MI);
  if (DefMI && TII->isFpMLxInstruction(DefMI->getOpcode()))
    // r0 = vmla
    // r3 = vmla r0, r1, r2
    // takes 16 - 17 cycles
    //
    // r0 = vmla
    // r4 = vmul r1, r2
    // r3 = vadd r0, r4
    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
    return true;

  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
  // preserves the in-order retirement of the instructions.
  // Look at the next few instructions, if *most* of them can cause hazards,
  // then the scheduler can't *fix* this, we'd better break up the VMLA.
  // (The block is scanned bottom-up, so the recorded instructions are the
  // ones that follow MI in program order.)
  for (unsigned i = 1; i <= 4; ++i) {
    // Walk the circular buffer backwards from the most recently pushed slot.
    unsigned Idx = (MIIdx + 4 - i) % 4;
    MachineInstr *NextMI = LastMIs[Idx];
    if (!NextMI)
      continue;

    if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
      return true;

    // Look for VMLx RAW hazard.
    if (hasRAWHazard(getDefReg(MI), NextMI))
      return true;
  }

  return false;
}
|
||||||
|
|
||||||
|
/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
|
||||||
|
/// of MUL + ADD / SUB instructions.
|
||||||
|
void
|
||||||
|
MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
|
||||||
|
unsigned MulOpc, unsigned AddSubOpc,
|
||||||
|
bool NegAcc, bool HasLane) {
|
||||||
|
unsigned DstReg = MI->getOperand(0).getReg();
|
||||||
|
bool DstDead = MI->getOperand(0).isDead();
|
||||||
|
unsigned AccReg = MI->getOperand(1).getReg();
|
||||||
|
unsigned Src1Reg = MI->getOperand(2).getReg();
|
||||||
|
unsigned Src2Reg = MI->getOperand(3).getReg();
|
||||||
|
bool Src1Kill = MI->getOperand(2).isKill();
|
||||||
|
bool Src2Kill = MI->getOperand(3).isKill();
|
||||||
|
unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
|
||||||
|
unsigned NextOp = HasLane ? 5 : 4;
|
||||||
|
ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
|
||||||
|
unsigned PredReg = MI->getOperand(++NextOp).getReg();
|
||||||
|
|
||||||
|
const TargetInstrDesc &TID1 = TII->get(MulOpc);
|
||||||
|
const TargetInstrDesc &TID2 = TII->get(AddSubOpc);
|
||||||
|
unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI));
|
||||||
|
|
||||||
|
MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg)
|
||||||
|
.addReg(Src1Reg, getKillRegState(Src1Kill))
|
||||||
|
.addReg(Src2Reg, getKillRegState(Src2Kill));
|
||||||
|
if (HasLane)
|
||||||
|
MIB.addImm(LaneImm);
|
||||||
|
MIB.addImm(Pred).addReg(PredReg);
|
||||||
|
|
||||||
|
MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2)
|
||||||
|
.addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
|
||||||
|
|
||||||
|
if (NegAcc) {
|
||||||
|
bool AccKill = MRI->hasOneNonDBGUse(AccReg);
|
||||||
|
MIB.addReg(TmpReg, getKillRegState(true))
|
||||||
|
.addReg(AccReg, getKillRegState(AccKill));
|
||||||
|
} else {
|
||||||
|
MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
|
||||||
|
}
|
||||||
|
MIB.addImm(Pred).addReg(PredReg);
|
||||||
|
|
||||||
|
DEBUG({
|
||||||
|
dbgs() << "Expanding: " << *MI;
|
||||||
|
dbgs() << " to:\n";
|
||||||
|
MachineBasicBlock::iterator MII = MI;
|
||||||
|
MII = llvm::prior(MII);
|
||||||
|
MachineInstr &MI2 = *MII;
|
||||||
|
MII = llvm::prior(MII);
|
||||||
|
MachineInstr &MI1 = *MII;
|
||||||
|
dbgs() << " " << MI1;
|
||||||
|
dbgs() << " " << MI2;
|
||||||
|
});
|
||||||
|
|
||||||
|
MI->eraseFromParent();
|
||||||
|
++NumExpand;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// ExpandFPMLxInstructions - Scan MBB from its last instruction to its first,
/// recording visited instructions in the circular buffer and expanding every
/// fp MLA / MLS instruction for which FindMLxHazard reports a hazard.
/// Returns true if at least one instruction was expanded.
bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
  bool Changed = false;

  clearStack();

  // Number of consecutive general-domain instructions seen so far.
  unsigned Skip = 0;
  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
  while (MII != E) {
    MachineInstr *MI = &*MII;

    // Labels, implicit defs and copies are not recorded at all.
    const bool Ignored =
      MI->isLabel() || MI->isImplicitDef() || MI->isCopy();
    if (Ignored) {
      ++MII;
      continue;
    }

    const TargetInstrDesc &TID = MI->getDesc();
    if (TID.isBarrier()) {
      // Start over with an empty history on a barrier.
      clearStack();
      Skip = 0;
      ++MII;
      continue;
    }

    if ((TID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
      if (++Skip == 2)
        // Assume dual issues of non-VFP / NEON instructions.
        pushStack(0);
      ++MII;
      continue;
    }

    Skip = 0;

    unsigned MulOpc, AddSubOpc;
    bool NegAcc, HasLane;
    if (TII->isFpMLxInstruction(TID.getOpcode(),
                                MulOpc, AddSubOpc, NegAcc, HasLane) &&
        FindMLxHazard(MI)) {
      ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
      E = MBB.rend(); // May have changed if MI was the 1st instruction.
      Changed = true;
      // Deliberately no ++MII here: MI has been erased.
      continue;
    }

    pushStack(MI);
    ++MII;
  }

  return Changed;
}
|
||||||
|
|
||||||
|
/// runOnMachineFunction - Cache the instruction info, register info and
/// machine register info of Fn, then run ExpandFPMLxInstructions() on each of
/// its basic blocks. Returns true if any block was modified.
bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
  TRI = Fn.getTarget().getRegisterInfo();
  MRI = &Fn.getRegInfo();

  bool AnyExpanded = false;
  MachineFunction::iterator BB = Fn.begin(), BBEnd = Fn.end();
  while (BB != BBEnd) {
    // Every block is processed, even after an earlier one has changed.
    if (ExpandFPMLxInstructions(*BB))
      AnyExpanded = true;
    ++BB;
  }

  return AnyExpanded;
}
|
||||||
|
|
||||||
|
/// createMLxExpansionPass - Allocate a new MLxExpansion pass and return it as
/// a FunctionPass pointer.
FunctionPass *llvm::createMLxExpansionPass() {
  FunctionPass *Pass = new MLxExpansion();
  return Pass;
}
|
|
@ -1,53 +0,0 @@
|
||||||
//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===//
|
|
||||||
//
|
|
||||||
// The LLVM Compiler Infrastructure
|
|
||||||
//
|
|
||||||
// This file is distributed under the University of Illinois Open Source
|
|
||||||
// License. See LICENSE.TXT for details.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
#include "ARM.h"
|
|
||||||
#include "Thumb2HazardRecognizer.h"
|
|
||||||
#include "llvm/CodeGen/MachineInstr.h"
|
|
||||||
#include "llvm/CodeGen/ScheduleDAG.h"
|
|
||||||
using namespace llvm;
|
|
||||||
|
|
||||||
/// getHazardType - While instructions of an IT block are still outstanding
/// (ITBlockSize != 0), report Hazard for any non-debug instruction other than
/// the one stored at ITBlockMIs[ITBlockSize-1]. In all other cases the answer
/// comes from PostRAHazardRecognizer::getHazardType().
ScheduleHazardRecognizer::HazardType
Thumb2HazardRecognizer::getHazardType(SUnit *SU) {
  // No IT block in flight: nothing Thumb2-specific to check.
  if (ITBlockSize == 0)
    return PostRAHazardRecognizer::getHazardType(SU);

  MachineInstr *Candidate = SU->getInstr();
  if (Candidate->isDebugValue() || Candidate == ITBlockMIs[ITBlockSize-1])
    return PostRAHazardRecognizer::getHazardType(SU);

  return Hazard;
}
|
|
||||||
|
|
||||||
void Thumb2HazardRecognizer::Reset() {
|
|
||||||
ITBlockSize = 0;
|
|
||||||
PostRAHazardRecognizer::Reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
/// EmitInstruction - Update IT block tracking for the instruction in SU, then
/// forward to PostRAHazardRecognizer::EmitInstruction().
///
/// While an IT block is being tracked, each emitted instruction decrements
/// the outstanding count. Otherwise, when the instruction is a t2IT, the
/// block length is derived from the trailing zero count of its mask operand
/// and the instructions that follow it are recorded in ITBlockMIs.
void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) {
  MachineInstr *Emitted = SU->getInstr();

  if (ITBlockSize != 0) {
    --ITBlockSize;
  } else if (Emitted->getOpcode() == ARM::t2IT) {
    unsigned Mask = Emitted->getOperand(1).getImm();
    unsigned NumTZ = CountTrailingZeros_32(Mask);
    assert(NumTZ <= 3 && "Invalid IT mask!");
    ITBlockSize = 4 - NumTZ;

    // Fill the slots from the highest index down, so the instruction right
    // after the IT lands in ITBlockMIs[ITBlockSize-1].
    MachineBasicBlock::iterator Cursor = Emitted;
    for (unsigned Slot = ITBlockSize; Slot != 0; --Slot) {
      // Advance to the next instruction, skipping any dbg_value instructions.
      ++Cursor;
      while (Cursor->isDebugValue())
        ++Cursor;
      ITBlockMIs[Slot-1] = &*Cursor;
    }
  }

  PostRAHazardRecognizer::EmitInstruction(SU);
}
|
|
|
@ -1,40 +0,0 @@
|
||||||
//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===//
|
|
||||||
//
|
|
||||||
// The LLVM Compiler Infrastructure
|
|
||||||
//
|
|
||||||
// This file is distributed under the University of Illinois Open Source
|
|
||||||
// License. See LICENSE.TXT for details.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
//
|
|
||||||
// This file defines hazard recognizers for scheduling Thumb2 functions on
|
|
||||||
// ARM processors.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
#ifndef THUMB2HAZARDRECOGNIZER_H
|
|
||||||
#define THUMB2HAZARDRECOGNIZER_H
|
|
||||||
|
|
||||||
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
|
|
||||||
|
|
||||||
namespace llvm {
|
|
||||||
|
|
||||||
class MachineInstr;
|
|
||||||
|
|
||||||
class Thumb2HazardRecognizer : public PostRAHazardRecognizer {
|
|
||||||
unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled.
|
|
||||||
MachineInstr *ITBlockMIs[4];
|
|
||||||
|
|
||||||
public:
|
|
||||||
Thumb2HazardRecognizer(const InstrItineraryData *ItinData) :
|
|
||||||
PostRAHazardRecognizer(ItinData) {}
|
|
||||||
|
|
||||||
virtual HazardType getHazardType(SUnit *SU);
|
|
||||||
virtual void Reset();
|
|
||||||
virtual void EmitInstruction(SUnit *SU);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
} // end namespace llvm
|
|
||||||
|
|
||||||
#endif // THUMB2HAZARDRECOGNIZER_H
|
|
|
@ -17,7 +17,6 @@
|
||||||
#include "ARMAddressingModes.h"
|
#include "ARMAddressingModes.h"
|
||||||
#include "ARMGenInstrInfo.inc"
|
#include "ARMGenInstrInfo.inc"
|
||||||
#include "ARMMachineFunctionInfo.h"
|
#include "ARMMachineFunctionInfo.h"
|
||||||
#include "Thumb2HazardRecognizer.h"
|
|
||||||
#include "Thumb2InstrInfo.h"
|
#include "Thumb2InstrInfo.h"
|
||||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||||
|
@ -175,11 +174,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
||||||
ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
|
ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// CreateTargetPostRAHazardRecognizer - Allocate a Thumb2HazardRecognizer for
/// the itinerary data II and return it through the ScheduleHazardRecognizer
/// base pointer type.
ScheduleHazardRecognizer *Thumb2InstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const {
  Thumb2HazardRecognizer *Recognizer = new Thumb2HazardRecognizer(II);
  return static_cast<ScheduleHazardRecognizer *>(Recognizer);
}
|
|
||||||
|
|
||||||
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
|
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
|
||||||
MachineBasicBlock::iterator &MBBI, DebugLoc dl,
|
MachineBasicBlock::iterator &MBBI, DebugLoc dl,
|
||||||
unsigned DestReg, unsigned BaseReg, int NumBytes,
|
unsigned DestReg, unsigned BaseReg, int NumBytes,
|
||||||
|
|
|
@ -65,9 +65,6 @@ public:
|
||||||
/// always be able to get register info as well (through this method).
|
/// always be able to get register info as well (through this method).
|
||||||
///
|
///
|
||||||
const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
|
const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
|
||||||
|
|
||||||
ScheduleHazardRecognizer *
|
|
||||||
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
|
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
|
||||||
|
|
|
@ -270,8 +270,9 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
|
||||||
define arm_aapcs_vfpcc i32 @t10() nounwind {
|
define arm_aapcs_vfpcc i32 @t10() nounwind {
|
||||||
entry:
|
entry:
|
||||||
; CHECK: t10:
|
; CHECK: t10:
|
||||||
|
; CHECK: vmul.f32 q8, q8, d0[0]
|
||||||
; CHECK: vmov.i32 q9, #0x3F000000
|
; CHECK: vmov.i32 q9, #0x3F000000
|
||||||
; CHECK: vmla.f32 q8, q8, d0[0]
|
; CHECK: vadd.f32 q8, q8, q8
|
||||||
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
|
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
|
||||||
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
|
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
|
||||||
%2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
|
%2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
|
||||||
|
|
Loading…
Reference in New Issue