[ARM] MVE VPT Block Pass

Initial commit of a new pass to create vector predication blocks, called VPT
blocks, that are supported by the Armv8.1-M MVE architecture.

This is a first naive implementation. I.e., for 2 consecutive predicated
instructions I1 and I2, for example, it will generate 2 VPT blocks:

VPST
I1
VPST
I2

A more optimal implementation would obviously put instructions in the same VPT
block when they are predicated on the same condition and when it is allowed to
do this:

VPTT
I1
I2

We will address this optimisation with follow up patches when the groundwork is
in. Creating VPT Blocks is very similar to IT Blocks, which is the reason I
added this to Thumb2ITBlocks.cpp. This allows reuse of the def use analysis
that we need for the more optimal implementation.

VPT blocks cannot be nested in IT blocks, and vice versa, and so these 2 passes
cannot interact with each other. Instructions allowed in VPT blocks must
be MVE instructions that are marked as VPT compatible.

Differential Revision: https://reviews.llvm.org/D63247

llvm-svn: 363370
This commit is contained in:
Sjoerd Meijer 2019-06-14 11:46:05 +00:00
parent 0f15ba98f5
commit 3058a62b90
7 changed files with 227 additions and 0 deletions

View File

@ -46,6 +46,7 @@ FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
FunctionPass *createMVEVPTBlockPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
@ -68,6 +69,7 @@ void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
} // end namespace llvm

View File

@ -95,6 +95,7 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@ -508,6 +509,7 @@ void ARMPassConfig::addPreSched2() {
return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
}));
}
addPass(createMVEVPTBlockPass());
addPass(createThumb2ITBlockPass());
}

View File

@ -316,3 +316,123 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
FunctionPass *llvm::createThumb2ITBlockPass() {
return new Thumb2ITBlockPass();
}
#undef DEBUG_TYPE
#define DEBUG_TYPE "arm-mve-vpt"
namespace {
class MVEVPTBlock : public MachineFunctionPass {
public:
static char ID;
const Thumb2InstrInfo *TII;
const TargetRegisterInfo *TRI;
MVEVPTBlock() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override {
return "MVE VPT block insertion pass";
}
private:
bool InsertVPTBlocks(MachineBasicBlock &MBB);
};
char MVEVPTBlock::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
enum VPTMaskValue {
T = 8, // 0b1000
TT = 4, // 0b0100
TE = 12, // 0b1100
TTT = 2, // 0b0010
TTE = 6, // 0b0110
TEE = 10, // 0b1010
TET = 14, // 0b1110
TTTT = 1, // 0b0001
TTTE = 3, // 0b0011
TTEE = 5, // 0b0101
TTET = 7, // 0b0111
TEEE = 9, // 0b1001
TEET = 11, // 0b1011
TETT = 13, // 0b1101
TETE = 15 // 0b1111
};
bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
bool Modified = false;
MachineBasicBlock::iterator MBIter = Block.begin();
MachineBasicBlock::iterator EndIter = Block.end();
while (MBIter != EndIter) {
MachineInstr *MI = &*MBIter;
unsigned PredReg = 0;
DebugLoc dl = MI->getDebugLoc();
ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
// The idea of the predicate is that None, Then and Else are for use when
// handling assembly language: they correspond to the three possible
// suffixes "", "t" and "e" on the mnemonic. So when instructions are read
// from assembly source or disassembled from object code, you expect to see
// a mixture whenever there's a long VPT block. But in code generation, we
// hope we'll never generate an Else as input to this pass.
assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
if (Pred == ARMVCC::None) {
++MBIter;
continue;
}
MachineInstrBuilder MIBuilder =
BuildMI(Block, MBIter, dl, TII->get(ARM::t2VPST));
MachineInstr *LastITMI = MI;
MachineBasicBlock::iterator InsertPos = MIBuilder.getInstr();
// The mask value for the VPST instruction is T = 0b1000 = 8
MIBuilder.addImm(VPTMaskValue::T);
finalizeBundle(Block, InsertPos.getInstrIterator(),
++LastITMI->getIterator());
Modified = true;
LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump(););
++MBIter;
}
return Modified;
}
bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
return false;
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
TRI = STI.getRegisterInfo();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
<< "********** Function: " << Fn.getName() << '\n');
bool Modified = false;
for (MachineBasicBlock &MBB : Fn)
Modified |= InsertVPTBlocks(MBB);
LLVM_DEBUG(dbgs() << "**************************************\n");
return Modified;
}
/// createMVEVPTBlock - Returns an instance of the MVE VPT block
/// insertion pass.
FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }

View File

@ -685,3 +685,28 @@ ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
return ARMCC::AL;
return getInstrPredicate(MI, PredReg);
}
int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
const MCInstrDesc &MCID = MI.getDesc();
if (!MCID.OpInfo)
return -1;
for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
if (ARM::isVpred(MCID.OpInfo[i].OperandType))
return i;
return -1;
}
ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
unsigned &PredReg) {
int PIdx = findFirstVPTPredOperandIdx(MI);
if (PIdx == -1) {
PredReg = 0;
return ARMVCC::None;
}
PredReg = MI.getOperand(PIdx+1).getReg();
return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm();
}

View File

@ -68,6 +68,12 @@ private:
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
// getVPTInstrPredicate: VPT analogue of that, plus a helper function
// corresponding to MachineInstr::findFirstPredOperandIdx.
int findFirstVPTPredOperandIdx(const MachineInstr &MI);
ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI,
unsigned &PredReg);
}
#endif

View File

@ -125,6 +125,7 @@
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Machine Block Frequency Analysis
; CHECK-NEXT: If Converter
; CHECK-NEXT: MVE VPT block insertion pass
; CHECK-NEXT: Thumb IT blocks insertion pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction

View File

@ -0,0 +1,71 @@
# RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s
--- |
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"
define hidden arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32_v2(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
entry:
%conv.i = zext i16 %p to i32
%0 = tail call nnan ninf nsz <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i32 %conv.i) #2
ret <4 x float> %0
}
declare <4 x float> @llvm.arm.mve.vminnm.m.v4f32.v4f32.v4f32.v4f32.i32(<4 x float>, <4 x float>, <4 x float>, i32) #1
attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
...
---
name: test_vminnmq_m_f32_v2
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$q0', virtual-reg: '' }
- { reg: '$q1', virtual-reg: '' }
- { reg: '$q2', virtual-reg: '' }
- { reg: '$r0', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack: []
constants: []
body: |
bb.0.entry:
liveins: $q0, $q1, $q2, $r0
; CHECK: VPST 8, implicit-def $p0
; CHECK-NEXT: $q0 = nnan ninf nsz VMINNMf32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0
$vpr = VMSR_P0 killed $r0, 14, $noreg
renamable $q0 = nnan ninf nsz VMINNMf32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0
tBX_RET 14, $noreg, implicit $q0
...