llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

5978 lines
216 KiB
C++
Raw Normal View History

//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "aarch64-isel"
using namespace llvm;
using namespace MIPatternMatch;
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET
class AArch64InstructionSelector : public InstructionSelector {
public:
AArch64InstructionSelector(const AArch64TargetMachine &TM,
const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI);
bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
void setupMF(MachineFunction &MF, GISelKnownBits &KB,
CodeGenCoverage &CoverageInfo) override {
InstructionSelector::setupMF(MF, KB, CoverageInfo);
// hasFnAttribute() is expensive to call on every BRCOND selection, so
// cache it here for each run of the selector.
ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
MFReturnAddr = Register();
processPHIs(MF);
}
private:
/// tblgen-erated 'select' implementation, used as the initial selector for
/// the patterns that don't require complex C++.
2017-11-16 08:46:35 +08:00
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
// A lowering phase that runs before any selection attempts.
// Returns true if the instruction was modified.
bool preISelLower(MachineInstr &I);
// An early selection function that runs before the selectImpl() call.
bool earlySelect(MachineInstr &I) const;
// Do some preprocessing of G_PHIs before we begin selection.
void processPHIs(MachineFunction &MF);
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
/// Eliminate same-sized cross-bank copies into stores before selectImpl().
bool contractCrossBankCopyIntoStore(MachineInstr &I,
MachineRegisterInfo &MRI);
bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
///@{
/// Helper functions for selectCompareBranch.
bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
MachineIRBuilder &MIB) const;
bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
MachineIRBuilder &MIB) const;
bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
MachineIRBuilder &MIB) const;
bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const;
///@}
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
// Helper to generate an equivalent of scalar_to_vector into a new register,
// returned via 'Dst'.
MachineInstr *emitScalarToVector(unsigned EltSize,
const TargetRegisterClass *DstRC,
Register Scalar,
MachineIRBuilder &MIRBuilder) const;
/// Emit a lane insert into \p DstReg, or a new vector register if None is
/// provided.
///
/// The lane inserted into is defined by \p LaneIdx. The vector source
/// register is given by \p SrcReg. The register containing the element is
/// given by \p EltReg.
MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
Register EltReg, unsigned LaneIdx,
const RegisterBank &RB,
MachineIRBuilder &MIRBuilder) const;
bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
MachineRegisterInfo &MRI) const;
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectSplitVectorUnmerge(MachineInstr &I,
MachineRegisterInfo &MRI) const;
bool selectIntrinsicWithSideEffects(MachineInstr &I,
MachineRegisterInfo &MRI) const;
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const;
unsigned emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
// Emit a vector concat operation.
MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
Register Op2,
MachineIRBuilder &MIRBuilder) const;
// Emit an integer compare between LHS and RHS, which checks for Predicate.
MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
/// Emit a floating point comparison between \p LHS and \p RHS.
MachineInstr *emitFPCompare(Register LHS, Register RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitInstr(unsigned Opcode,
std::initializer_list<llvm::DstOp> DstOps,
std::initializer_list<llvm::SrcOp> SrcOps,
MachineIRBuilder &MIRBuilder,
const ComplexRendererFns &RenderFns = None) const;
/// Helper function to emit an add or sub instruction.
///
/// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
/// in a specific order.
///
/// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
///
/// \code
/// const std::array<std::array<unsigned, 2>, 4> Table {
/// {{AArch64::ADDXri, AArch64::ADDWri},
/// {AArch64::ADDXrs, AArch64::ADDWrs},
/// {AArch64::ADDXrr, AArch64::ADDWrr},
/// {AArch64::SUBXri, AArch64::SUBWri},
/// {AArch64::ADDXrx, AArch64::ADDWrx}}};
/// \endcode
///
/// Each row in the table corresponds to a different addressing mode. Each
/// column corresponds to a different register size.
///
/// \attention Rows must be structured as follows:
/// - Row 0: The ri opcode variants
/// - Row 1: The rs opcode variants
/// - Row 2: The rr opcode variants
/// - Row 3: The ri opcode variants for negative immediates
/// - Row 4: The rx opcode variants
///
/// \attention Columns must be structured as follows:
/// - Column 0: The 64-bit opcode variants
/// - Column 1: The 32-bit opcode variants
///
/// \p Dst is the destination register of the binop to emit.
/// \p LHS is the left-hand operand of the binop to emit.
/// \p RHS is the right-hand operand of the binop to emit.
MachineInstr *emitAddSub(
const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
Register Dst, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
AArch64CC::CondCode CC,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
const RegisterBank &DstRB, LLT ScalarTy,
Register VecReg, unsigned LaneIdx,
MachineIRBuilder &MIRBuilder) const;
/// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be
/// materialized using a FMOV instruction, then update MI and return it.
/// Otherwise, do nothing and return a nullptr.
MachineInstr *emitFMovForFConstant(MachineInstr &MI,
MachineRegisterInfo &MRI) const;
/// Emit a CSet for an integer compare.
///
/// \p DefReg is expected to be a 32-bit scalar register.
MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
MachineIRBuilder &MIRBuilder) const;
/// Emit a CSet for a FP compare.
///
/// \p Dst is expected to be a 32-bit scalar register.
MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
MachineIRBuilder &MIRBuilder) const;
/// Emit the overflow op for \p Opcode.
///
/// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
/// G_USUBO, etc.
std::pair<MachineInstr *, AArch64CC::CondCode>
emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
/// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
/// \p IsNegative is true if the test should be "not zero".
/// This will also optimize the test bit instruction when possible.
MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const;
/// Emit a CB(N)Z instruction which branches to \p DestMBB.
MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
MachineBasicBlock *DestMBB,
MachineIRBuilder &MIB) const;
// Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
// We use these manually instead of using the importer since it doesn't
// support SDNodeXForm.
ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
unsigned Size) const;
ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
return selectAddrModeUnscaled(Root, 1);
}
ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
return selectAddrModeUnscaled(Root, 2);
}
ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
return selectAddrModeUnscaled(Root, 4);
}
ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
return selectAddrModeUnscaled(Root, 8);
}
ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
return selectAddrModeUnscaled(Root, 16);
}
/// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
/// from complex pattern matchers like selectAddrModeIndexed().
ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
MachineRegisterInfo &MRI) const;
ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const;
template <int Width>
ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
return selectAddrModeIndexed(Root, Width / 8);
}
bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
const MachineRegisterInfo &MRI) const;
ComplexRendererFns
selectAddrModeShiftedExtendXReg(MachineOperand &Root,
unsigned SizeInBytes) const;
/// Returns a \p ComplexRendererFns which contains a base, offset, and whether
/// or not a shift + extend should be folded into an addressing mode. Returns
/// None when this is not profitable or possible.
ComplexRendererFns
selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
MachineOperand &Offset, unsigned SizeInBytes,
bool WantsExt) const;
ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
unsigned SizeInBytes) const;
template <int Width>
ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
return selectAddrModeXRO(Root, Width / 8);
}
ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
unsigned SizeInBytes) const;
template <int Width>
ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
return selectAddrModeWRO(Root, Width / 8);
}
ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
return selectShiftedRegister(Root);
}
ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
// TODO: selectShiftedRegister should allow for rotates on logical shifts.
// For now, make them the same. The only difference between the two is that
// logical shifts are allowed to fold in rotates. Otherwise, these are
// functionally the same.
return selectShiftedRegister(Root);
}
/// Given an extend instruction, determine the correct shift-extend type for
/// that instruction.
///
/// If the instruction is going to be used in a load or store, pass
/// \p IsLoadStore = true.
AArch64_AM::ShiftExtendType
getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
bool IsLoadStore = false) const;
/// Move \p Reg to \p RC if \p Reg is not already on \p RC.
///
/// \returns Either \p Reg if no change was necessary, or the new register
/// created by moving \p Reg.
///
/// Note: This uses emitCopy right now.
Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
MachineIRBuilder &MIB) const;
ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
TableGen/GlobalISel: Add way for SDNodeXForm to work on timm The current implementation assumes there is an instruction associated with the transform, but this is not the case for timm/TargetConstant/immarg values. These transforms should directly operate on a specific MachineOperand in the source instruction. TableGen would assert if you attempted to define an equivalent GISDNodeXFormEquiv using timm when it failed to find the instruction matcher. Specially recognize SDNodeXForms on timm, and pass the operand index to the render function. Ideally this would be a separate render function type that looks like void renderFoo(MachineInstrBuilder, const MachineOperand&), but this proved to be somewhat mechanically painful. Add an optional operand index which will only be passed if the transform should only look at the one source operand. Theoretically it would also be possible to only ever pass the MachineOperand, and the existing renderers would check the parent. I think that would be somewhat ugly for the standard usage which may want to inspect other operands, and I also think MachineOperand should eventually not carry a pointer to the parent instruction. Use it in one sample pattern. This isn't a great example, since the transform exists to satisfy DAG type constraints. This could also be avoided by just changing the MachineInstr's arbitrary choice of operand type from i16 to i32. Other patterns have nontrivial uses, but this serves as the simplest example. One flaw this still has is if you try to use an SDNodeXForm defined for imm, but the source pattern uses timm, you still see the "Failed to lookup instruction" assert. However, there is now a way to avoid it.
2020-01-09 01:53:15 +08:00
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;
void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx = -1) const;
void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx = -1) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V,
unsigned OpFlags) const;
// Optimization methods.
bool tryOptSelect(MachineInstr &MI) const;
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
/// Return true if \p MI is a load or store of \p NumBytes bytes.
bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
/// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
/// register zeroed out. In other words, the result of MI has been explicitly
/// zero extended.
bool isDef32(const MachineInstr &MI) const;
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
const AArch64RegisterInfo &TRI;
const AArch64RegisterBankInfo &RBI;
bool ProduceNonFlagSettingCondBr = false;
// Some cached values used during selection.
// We use LR as a live-in register, and we keep track of it here as it can be
// clobbered by calls.
Register MFReturnAddr;
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};
} // end anonymous namespace
#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
: InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
TRI(*STI.getRegisterInfo()), RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
const RegisterBankInfo &RBI,
bool GetAllRegSet = false) {
if (RB.getID() == AArch64::GPRRegBankID) {
if (Ty.getSizeInBits() <= 32)
return GetAllRegSet ? &AArch64::GPR32allRegClass
: &AArch64::GPR32RegClass;
if (Ty.getSizeInBits() == 64)
return GetAllRegSet ? &AArch64::GPR64allRegClass
: &AArch64::GPR64RegClass;
return nullptr;
}
if (RB.getID() == AArch64::FPRRegBankID) {
if (Ty.getSizeInBits() <= 16)
return &AArch64::FPR16RegClass;
if (Ty.getSizeInBits() == 32)
return &AArch64::FPR32RegClass;
if (Ty.getSizeInBits() == 64)
return &AArch64::FPR64RegClass;
if (Ty.getSizeInBits() == 128)
return &AArch64::FPR128RegClass;
return nullptr;
}
return nullptr;
}
/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
bool GetAllRegSet = false) {
unsigned RegBankID = RB.getID();
if (RegBankID == AArch64::GPRRegBankID) {
if (SizeInBits <= 32)
return GetAllRegSet ? &AArch64::GPR32allRegClass
: &AArch64::GPR32RegClass;
if (SizeInBits == 64)
return GetAllRegSet ? &AArch64::GPR64allRegClass
: &AArch64::GPR64RegClass;
}
if (RegBankID == AArch64::FPRRegBankID) {
switch (SizeInBits) {
default:
return nullptr;
case 8:
return &AArch64::FPR8RegClass;
case 16:
return &AArch64::FPR16RegClass;
case 32:
return &AArch64::FPR32RegClass;
case 64:
return &AArch64::FPR64RegClass;
case 128:
return &AArch64::FPR128RegClass;
}
}
return nullptr;
}
/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
const TargetRegisterInfo &TRI, unsigned &SubReg) {
switch (TRI.getRegSizeInBits(*RC)) {
case 8:
SubReg = AArch64::bsub;
break;
case 16:
SubReg = AArch64::hsub;
break;
case 32:
if (RC != &AArch64::FPR32RegClass)
SubReg = AArch64::sub_32;
else
SubReg = AArch64::ssub;
break;
case 64:
SubReg = AArch64::dsub;
break;
default:
LLVM_DEBUG(
dbgs() << "Couldn't find appropriate subregister for register class.");
return false;
}
return true;
}
/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
switch (RB.getID()) {
case AArch64::GPRRegBankID:
return 32;
case AArch64::FPRRegBankID:
return 8;
default:
llvm_unreachable("Tried to get minimum size for unknown register bank.");
}
}
static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
auto &MI = *Root.getParent();
auto &MBB = *MI.getParent();
auto &MF = *MBB.getParent();
auto &MRI = MF.getRegInfo();
uint64_t Immed;
if (Root.isImm())
Immed = Root.getImm();
else if (Root.isCImm())
Immed = Root.getCImm()->getZExtValue();
else if (Root.isReg()) {
auto ValAndVReg =
getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
if (!ValAndVReg)
return None;
Immed = ValAndVReg->Value.getSExtValue();
} else
return None;
return Immed;
}
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - all operands are not in the same bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the aarch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
const AArch64RegisterBankInfo &RBI,
const MachineRegisterInfo &MRI,
const AArch64RegisterInfo &TRI) {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
if (!Ty.isValid()) {
LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
return true;
}
const RegisterBank *PrevOpBank = nullptr;
for (auto &MO : I.operands()) {
// FIXME: Support non-register operands.
if (!MO.isReg()) {
LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
return true;
}
// FIXME: Can generic operations have physical registers operands? If
// so, this will need to be taught about that, and we'll need to get the
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
if (!Register::isVirtualRegister(MO.getReg())) {
LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
if (!OpBank) {
LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
return true;
}
if (PrevOpBank && OpBank != PrevOpBank) {
LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
return true;
}
PrevOpBank = OpBank;
}
return false;
}
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
[AArch64][GlobalISel] Legalize narrow scalar ops again. Since r279760, we've been marking as legal operations on narrow integer types that have wider legal equivalents (for instance, G_ADD s8). Compared to legalizing these operations, this reduced the amount of extends/truncates required, but was always a weird legalization decision made at selection time. So far, we haven't been able to formalize it in a way that permits the selector generated from SelectionDAG patterns to be sufficient. Using a wide instruction (say, s64), when a narrower instruction exists (s32) would introduce register class incompatibilities (when one narrow generic instruction is selected to the wider variant, but another is selected to the narrower variant). It's also impractical to limit which narrow operations are matched for which instruction, as restricting "narrow selection" to ranges of types clashes with potentially incompatible instruction predicates. Concerns were also raised regarding MIPS64's sign-extended register assumptions, as well as wrapping behavior. See discussions in https://reviews.llvm.org/D26878. Instead, legalize the operations. Should we ever revert to selecting these narrow operations, we should try to represent this more accurately: for instance, by separating a "concrete" type on operations, and an "underlying" type on vregs, we could move the "this narrow-looking op is really legal" decision to the legalizer, and let the selector use the "underlying" vreg type only, which would be guaranteed to map to a register class. In any case, we eventually should mitigate: - the performance impact by selecting no-op extract/truncates to COPYs (which we currently do), and the COPYs to register reuses (which we don't do yet). - the compile-time impact by optimizing away extract/truncate sequences in the legalizer. llvm-svn: 292827
2017-01-24 05:10:05 +08:00
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
unsigned OpSize) {
switch (RegBankID) {
case AArch64::GPRRegBankID:
if (OpSize == 32) {
switch (GenericOpc) {
case TargetOpcode::G_SHL:
return AArch64::LSLVWr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVWr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVWr;
default:
return GenericOpc;
}
} else if (OpSize == 64) {
switch (GenericOpc) {
case TargetOpcode::G_PTR_ADD:
return AArch64::ADDXrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVXr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVXr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVXr;
default:
return GenericOpc;
}
}
break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 32:
switch (GenericOpc) {
case TargetOpcode::G_FADD:
return AArch64::FADDSrr;
case TargetOpcode::G_FSUB:
return AArch64::FSUBSrr;
case TargetOpcode::G_FMUL:
return AArch64::FMULSrr;
case TargetOpcode::G_FDIV:
return AArch64::FDIVSrr;
default:
return GenericOpc;
}
case 64:
switch (GenericOpc) {
case TargetOpcode::G_FADD:
return AArch64::FADDDrr;
case TargetOpcode::G_FSUB:
return AArch64::FSUBDrr;
case TargetOpcode::G_FMUL:
return AArch64::FMULDrr;
case TargetOpcode::G_FDIV:
return AArch64::FDIVDrr;
case TargetOpcode::G_OR:
return AArch64::ORRv8i8;
default:
return GenericOpc;
}
}
break;
}
return GenericOpc;
}
/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize. This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
unsigned OpSize) {
const bool isStore = GenericOpc == TargetOpcode::G_STORE;
switch (RegBankID) {
case AArch64::GPRRegBankID:
switch (OpSize) {
case 8:
return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
case 16:
return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
case 32:
return isStore ? AArch64::STRWui : AArch64::LDRWui;
case 64:
return isStore ? AArch64::STRXui : AArch64::LDRXui;
}
break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 8:
return isStore ? AArch64::STRBui : AArch64::LDRBui;
case 16:
return isStore ? AArch64::STRHui : AArch64::LDRHui;
case 32:
return isStore ? AArch64::STRSui : AArch64::LDRSui;
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
}
break;
}
return GenericOpc;
}
#ifndef NDEBUG
/// Helper function that verifies that we have a valid copy at the end of
/// selectCopy. Verifies that the source and dest have the expected sizes and
/// then returns true.
static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
// Make sure the size of the source and dest line up.
assert(
(DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
(Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
// Copies are a mean to copy bits around, as long as we are
// on the same register class, that's fine. Otherwise, that
// means we need some SUBREG_TO_REG or AND & co.
(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
"Copy with different width?!");
// Check the size of the destination.
assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
"GPRs cannot get more than 64-bit width values");
return true;
}
#endif
/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
const RegisterBankInfo &RBI, Register SrcReg,
const TargetRegisterClass *To, unsigned SubReg) {
assert(SrcReg.isValid() && "Expected a valid source register?");
assert(To && "Destination register class cannot be null");
assert(SubReg && "Expected a valid subregister");
MachineIRBuilder MIB(I);
auto SubRegCopy =
MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(SubRegCopy.getReg(0));
// It's possible that the destination register won't be constrained. Make
// sure that happens.
if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
return true;
}
/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
// Special casing for cross-bank copies of s1s. We can technically represent
// a 1-bit value with any size of register. The minimum size for a GPR is 32
// bits. So, we need to put the FPR on 32 bits as well.
//
// FIXME: I'm not sure if this case holds true outside of copies. If it does,
// then we can pull it into the helpers that get the appropriate class for a
// register bank. Or make a new helper that carries along some constraint
// information.
if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
SrcSize = DstSize = 32;
return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
getMinClassForRegBank(DstRegBank, DstSize, true)};
}
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
// Find the correct register classes for the source and destination registers.
const TargetRegisterClass *SrcRC;
const TargetRegisterClass *DstRC;
std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
if (!DstRC) {
LLVM_DEBUG(dbgs() << "Unexpected dest size "
<< RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
return false;
}
// A couple helpers below, for making sure that the copy we produce is valid.
// Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
// to verify that the src and dst are the same size, since that's handled by
// the SUBREG_TO_REG.
bool KnownValid = false;
// Returns true, or asserts if something we don't expect happens. Instead of
// returning true, we return isValidCopy() to ensure that we verify the
// result.
auto CheckCopy = [&]() {
// If we have a bitcast or something, we can't have physical registers.
assert((I.isCopy() ||
(!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
!Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
"No phys reg on generic operator!");
bool ValidCopy = true;
#ifndef NDEBUG
ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
assert(ValidCopy && "Invalid copy.");
#endif
return ValidCopy;
};
// Is this a copy? If so, then we may need to insert a subregister copy.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
if (!SrcRC) {
LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
return false;
}
unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
unsigned SubReg;
// If the source bank doesn't support a subregister copy small enough,
// then we first need to copy to the destination bank.
if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
const TargetRegisterClass *DstTempRC =
getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
getSubRegForClass(DstRC, TRI, SubReg);
MachineIRBuilder MIB(I);
auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
} else if (SrcSize > DstSize) {
// If the source register is bigger than the destination we need to
// perform a subregister copy.
const TargetRegisterClass *SubRegRC =
getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
getSubRegForClass(SubRegRC, TRI, SubReg);
copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
} else if (DstSize > SrcSize) {
// If the destination register is bigger than the source we need to do
// a promotion using SUBREG_TO_REG.
const TargetRegisterClass *PromotionRC =
getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
getSubRegForClass(SrcRC, TRI, SubReg);
Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
.addImm(0)
.addUse(SrcReg)
.addImm(SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(PromoteReg);
// Promise that the copy is implicitly validated by the SUBREG_TO_REG.
KnownValid = true;
}
// If the destination is a physical register, then there's nothing to
// change, so we're done.
if (Register::isPhysicalRegister(DstReg))
return CheckCopy();
}
// No need to constrain SrcReg. It will get constrained when we hit another
// of its use or its defs. Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
return false;
}
I.setDesc(TII.get(AArch64::COPY));
return CheckCopy();
}
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
if (!DstTy.isScalar() || !SrcTy.isScalar())
return GenericOpc;
const unsigned DstSize = DstTy.getSizeInBits();
const unsigned SrcSize = SrcTy.getSizeInBits();
switch (DstSize) {
case 32:
switch (SrcSize) {
case 32:
switch (GenericOpc) {
case TargetOpcode::G_SITOFP:
return AArch64::SCVTFUWSri;
case TargetOpcode::G_UITOFP:
return AArch64::UCVTFUWSri;
case TargetOpcode::G_FPTOSI:
return AArch64::FCVTZSUWSr;
case TargetOpcode::G_FPTOUI:
return AArch64::FCVTZUUWSr;
default:
return GenericOpc;
}
case 64:
switch (GenericOpc) {
case TargetOpcode::G_SITOFP:
return AArch64::SCVTFUXSri;
case TargetOpcode::G_UITOFP:
return AArch64::UCVTFUXSri;
case TargetOpcode::G_FPTOSI:
return AArch64::FCVTZSUWDr;
case TargetOpcode::G_FPTOUI:
return AArch64::FCVTZUUWDr;
default:
return GenericOpc;
}
default:
return GenericOpc;
}
case 64:
switch (SrcSize) {
case 32:
switch (GenericOpc) {
case TargetOpcode::G_SITOFP:
return AArch64::SCVTFUWDri;
case TargetOpcode::G_UITOFP:
return AArch64::UCVTFUWDri;
case TargetOpcode::G_FPTOSI:
return AArch64::FCVTZSUXSr;
case TargetOpcode::G_FPTOUI:
return AArch64::FCVTZUUXSr;
default:
return GenericOpc;
}
case 64:
switch (GenericOpc) {
case TargetOpcode::G_SITOFP:
return AArch64::SCVTFUXDri;
case TargetOpcode::G_UITOFP:
return AArch64::UCVTFUXDri;
case TargetOpcode::G_FPTOSI:
return AArch64::FCVTZSUXDr;
case TargetOpcode::G_FPTOUI:
return AArch64::FCVTZUUXDr;
default:
return GenericOpc;
}
default:
return GenericOpc;
}
default:
return GenericOpc;
};
return GenericOpc;
}
MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
Register False, AArch64CC::CondCode CC,
MachineIRBuilder &MIB) const {
MachineRegisterInfo &MRI = *MIB.getMRI();
assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
RBI.getRegBank(True, MRI, TRI)->getID() &&
"Expected both select operands to have the same regbank?");
LLT Ty = MRI.getType(True);
if (Ty.isVector())
return nullptr;
const unsigned Size = Ty.getSizeInBits();
assert((Size == 32 || Size == 64) &&
"Expected 32 bit or 64 bit select only?");
const bool Is32Bit = Size == 32;
if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
return &*FCSel;
}
// By default, we'll try and emit a CSEL.
unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
bool Optimized = false;
auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
&Optimized](Register &Reg, Register &OtherReg,
bool Invert) {
if (Optimized)
return false;
// Attempt to fold:
//
// %sub = G_SUB 0, %x
// %select = G_SELECT cc, %reg, %sub
//
// Into:
// %select = CSNEG %reg, %x, cc
Register MatchReg;
if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
Reg = MatchReg;
if (Invert) {
CC = AArch64CC::getInvertedCondCode(CC);
std::swap(Reg, OtherReg);
}
return true;
}
// Attempt to fold:
//
// %xor = G_XOR %x, -1
// %select = G_SELECT cc, %reg, %xor
//
// Into:
// %select = CSINV %reg, %x, cc
if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
Reg = MatchReg;
if (Invert) {
CC = AArch64CC::getInvertedCondCode(CC);
std::swap(Reg, OtherReg);
}
return true;
}
// Attempt to fold:
//
// %add = G_ADD %x, 1
// %select = G_SELECT cc, %reg, %add
//
// Into:
// %select = CSINC %reg, %x, cc
if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) {
Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
Reg = MatchReg;
if (Invert) {
CC = AArch64CC::getInvertedCondCode(CC);
std::swap(Reg, OtherReg);
}
return true;
}
return false;
};
// Helper lambda which tries to use CSINC/CSINV for the instruction when its
// true/false values are constants.
// FIXME: All of these patterns already exist in tablegen. We should be
// able to import these.
auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
&Optimized]() {
if (Optimized)
return false;
auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
if (!TrueCst && !FalseCst)
return false;
Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
if (TrueCst && FalseCst) {
int64_t T = TrueCst->Value.getSExtValue();
int64_t F = FalseCst->Value.getSExtValue();
if (T == 0 && F == 1) {
// G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
True = ZReg;
False = ZReg;
return true;
}
if (T == 0 && F == -1) {
// G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
True = ZReg;
False = ZReg;
return true;
}
}
if (TrueCst) {
int64_t T = TrueCst->Value.getSExtValue();
if (T == 1) {
// G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
True = False;
False = ZReg;
CC = AArch64CC::getInvertedCondCode(CC);
return true;
}
if (T == -1) {
// G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
True = False;
False = ZReg;
CC = AArch64CC::getInvertedCondCode(CC);
return true;
}
}
if (FalseCst) {
int64_t F = FalseCst->Value.getSExtValue();
if (F == 1) {
// G_SELECT cc, t, 1 -> CSINC t, zreg, cc
Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
False = ZReg;
return true;
}
if (F == -1) {
// G_SELECT cc, t, -1 -> CSINC t, zreg, cc
Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
False = ZReg;
return true;
}
}
return false;
};
Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
Optimized |= TryOptSelectCst();
auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
return &*SelectInst;
}
static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
switch (P) {
default:
llvm_unreachable("Unknown condition code!");
case CmpInst::ICMP_NE:
return AArch64CC::NE;
case CmpInst::ICMP_EQ:
return AArch64CC::EQ;
case CmpInst::ICMP_SGT:
return AArch64CC::GT;
case CmpInst::ICMP_SGE:
return AArch64CC::GE;
case CmpInst::ICMP_SLT:
return AArch64CC::LT;
case CmpInst::ICMP_SLE:
return AArch64CC::LE;
case CmpInst::ICMP_UGT:
return AArch64CC::HI;
case CmpInst::ICMP_UGE:
return AArch64CC::HS;
case CmpInst::ICMP_ULT:
return AArch64CC::LO;
case CmpInst::ICMP_ULE:
return AArch64CC::LS;
}
}
static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (P) {
default:
llvm_unreachable("Unknown FP condition!");
case CmpInst::FCMP_OEQ:
CondCode = AArch64CC::EQ;
break;
case CmpInst::FCMP_OGT:
CondCode = AArch64CC::GT;
break;
case CmpInst::FCMP_OGE:
CondCode = AArch64CC::GE;
break;
case CmpInst::FCMP_OLT:
CondCode = AArch64CC::MI;
break;
case CmpInst::FCMP_OLE:
CondCode = AArch64CC::LS;
break;
case CmpInst::FCMP_ONE:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GT;
break;
case CmpInst::FCMP_ORD:
CondCode = AArch64CC::VC;
break;
case CmpInst::FCMP_UNO:
CondCode = AArch64CC::VS;
break;
case CmpInst::FCMP_UEQ:
CondCode = AArch64CC::EQ;
CondCode2 = AArch64CC::VS;
break;
case CmpInst::FCMP_UGT:
CondCode = AArch64CC::HI;
break;
case CmpInst::FCMP_UGE:
CondCode = AArch64CC::PL;
break;
case CmpInst::FCMP_ULT:
CondCode = AArch64CC::LT;
break;
case CmpInst::FCMP_ULE:
CondCode = AArch64CC::LE;
break;
case CmpInst::FCMP_UNE:
CondCode = AArch64CC::NE;
break;
}
}
/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
MachineRegisterInfo &MRI) {
assert(Reg.isValid() && "Expected valid register!");
while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
unsigned Opc = MI->getOpcode();
if (!MI->getOperand(0).isReg() ||
!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
break;
// (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
//
// (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
// on the truncated x is the same as the bit number on x.
if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
Opc == TargetOpcode::G_TRUNC) {
Register NextReg = MI->getOperand(1).getReg();
// Did we find something worth folding?
if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
break;
// NextReg is worth folding. Keep looking.
Reg = NextReg;
continue;
}
// Attempt to find a suitable operation with a constant on one side.
Optional<uint64_t> C;
Register TestReg;
switch (Opc) {
default:
break;
case TargetOpcode::G_AND:
case TargetOpcode::G_XOR: {
TestReg = MI->getOperand(1).getReg();
Register ConstantReg = MI->getOperand(2).getReg();
auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!VRegAndVal) {
// AND commutes, check the other side for a constant.
// FIXME: Can we canonicalize the constant so that it's always on the
// same side at some point earlier?
std::swap(ConstantReg, TestReg);
VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
}
if (VRegAndVal)
C = VRegAndVal->Value.getSExtValue();
break;
}
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
case TargetOpcode::G_SHL: {
TestReg = MI->getOperand(1).getReg();
auto VRegAndVal =
getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
if (VRegAndVal)
C = VRegAndVal->Value.getSExtValue();
break;
}
}
// Didn't find a constant or viable register. Bail out of the loop.
if (!C || !TestReg.isValid())
break;
// We found a suitable instruction with a constant. Check to see if we can
// walk through the instruction.
Register NextReg;
unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
switch (Opc) {
default:
break;
case TargetOpcode::G_AND:
// (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
if ((*C >> Bit) & 1)
NextReg = TestReg;
break;
case TargetOpcode::G_SHL:
// (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
// the type of the register.
if (*C <= Bit && (Bit - *C) < TestRegSize) {
NextReg = TestReg;
Bit = Bit - *C;
}
break;
case TargetOpcode::G_ASHR:
// (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
// in x
NextReg = TestReg;
Bit = Bit + *C;
if (Bit >= TestRegSize)
Bit = TestRegSize - 1;
break;
case TargetOpcode::G_LSHR:
// (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
if ((Bit + *C) < TestRegSize) {
NextReg = TestReg;
Bit = Bit + *C;
}
break;
case TargetOpcode::G_XOR:
// We can walk through a G_XOR by inverting whether we use tbz/tbnz when
// appropriate.
//
// e.g. If x' = xor x, c, and the b-th bit is set in c then
//
// tbz x', b -> tbnz x, b
//
// Because x' only has the b-th bit set if x does not.
if ((*C >> Bit) & 1)
Invert = !Invert;
NextReg = TestReg;
break;
}
// Check if we found anything worth folding.
if (!NextReg.isValid())
return Reg;
Reg = NextReg;
}
return Reg;
}
MachineInstr *AArch64InstructionSelector::emitTestBit(
Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const {
assert(TestReg.isValid());
assert(ProduceNonFlagSettingCondBr &&
"Cannot emit TB(N)Z with speculation tracking!");
MachineRegisterInfo &MRI = *MIB.getMRI();
// Attempt to optimize the test bit by walking over instructions.
TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
LLT Ty = MRI.getType(TestReg);
unsigned Size = Ty.getSizeInBits();
assert(!Ty.isVector() && "Expected a scalar!");
assert(Bit < 64 && "Bit is too large!");
// When the test register is a 64-bit register, we have to narrow to make
// TBNZW work.
bool UseWReg = Bit < 32;
unsigned NecessarySize = UseWReg ? 32 : 64;
if (Size != NecessarySize)
TestReg = moveScalarRegClass(
TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
MIB);
static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
{AArch64::TBZW, AArch64::TBNZW}};
unsigned Opc = OpcTable[UseWReg][IsNegative];
auto TestBitMI =
MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
return &*TestBitMI;
}
bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const {
assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
// Given something like this:
//
// %x = ...Something...
// %one = G_CONSTANT i64 1
// %zero = G_CONSTANT i64 0
// %and = G_AND %x, %one
// %cmp = G_ICMP intpred(ne), %and, %zero
// %cmp_trunc = G_TRUNC %cmp
// G_BRCOND %cmp_trunc, %bb.3
//
// We want to try and fold the AND into the G_BRCOND and produce either a
// TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
//
// In this case, we'd get
//
// TBNZ %x %bb.3
//
// Check if the AND has a constant on its RHS which we can use as a mask.
// If it's a power of 2, then it's the same as checking a specific bit.
// (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
auto MaybeBit = getConstantVRegValWithLookThrough(
AndInst.getOperand(2).getReg(), *MIB.getMRI());
if (!MaybeBit)
return false;
int32_t Bit = MaybeBit->Value.exactLogBase2();
if (Bit < 0)
return false;
Register TestReg = AndInst.getOperand(1).getReg();
// Emit a TB(N)Z.
emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
return true;
}
MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
bool IsNegative,
MachineBasicBlock *DestMBB,
MachineIRBuilder &MIB) const {
assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
MachineRegisterInfo &MRI = *MIB.getMRI();
assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
AArch64::GPRRegBankID &&
"Expected GPRs only?");
auto Ty = MRI.getType(CompareReg);
unsigned Width = Ty.getSizeInBits();
assert(!Ty.isVector() && "Expected scalar only?");
assert(Width <= 64 && "Expected width to be at most 64?");
static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
{AArch64::CBNZW, AArch64::CBNZX}};
unsigned Opc = OpcTable[IsNegative][Width == 64];
auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
return &*BranchMI;
}
bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
assert(I.getOpcode() == TargetOpcode::G_BRCOND);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two branches to implement.
emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB);
AArch64CC::CondCode CC1, CC2;
changeFCMPPredToAArch64CC(
static_cast<CmpInst::Predicate>(FCmp.getOperand(1).getPredicate()), CC1,
CC2);
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
if (CC2 != AArch64CC::AL)
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
assert(I.getOpcode() == TargetOpcode::G_BRCOND);
// Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
//
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
// instructions will not be produced, as they are conditional branch
// instructions that do not set flags.
if (!ProduceNonFlagSettingCondBr)
return false;
MachineRegisterInfo &MRI = *MIB.getMRI();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
auto Pred =
static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
Register LHS = ICmp.getOperand(2).getReg();
Register RHS = ICmp.getOperand(3).getReg();
// We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
// When we can emit a TB(N)Z, prefer that.
//
// Handle non-commutative condition codes first.
// Note that we don't want to do this when we have a G_AND because it can
// become a tst. The tst will make the test bit in the TB(N)Z redundant.
if (VRegAndVal && !AndInst) {
int64_t C = VRegAndVal->Value.getSExtValue();
// When we have a greater-than comparison, we can just test if the msb is
// zero.
if (C == -1 && Pred == CmpInst::ICMP_SGT) {
uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
I.eraseFromParent();
return true;
}
// When we have a less than comparison, we can just test if the msb is not
// zero.
if (C == 0 && Pred == CmpInst::ICMP_SLT) {
uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
I.eraseFromParent();
return true;
}
}
// Attempt to handle commutative condition codes. Right now, that's only
// eq/ne.
if (ICmpInst::isEquality(Pred)) {
if (!VRegAndVal) {
std::swap(RHS, LHS);
VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
}
if (VRegAndVal && VRegAndVal->Value == 0) {
// If there's a G_AND feeding into this branch, try to fold it away by
// emitting a TB(N)Z instead.
//
// Note: If we have LT, then it *is* possible to fold, but it wouldn't be
// beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
// would be redundant.
if (AndInst &&
tryOptAndIntoCompareBranch(
*AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
I.eraseFromParent();
return true;
}
// Otherwise, try to emit a CB(N)Z instead.
auto LHSTy = MRI.getType(LHS);
if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
I.eraseFromParent();
return true;
}
}
}
return false;
}
bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
assert(I.getOpcode() == TargetOpcode::G_BRCOND);
if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
return true;
// Couldn't optimize. Emit a compare + a Bcc.
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
auto PredOp = ICmp.getOperand(1);
emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectCompareBranch(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
Register CondReg = I.getOperand(0).getReg();
MachineInstr *CCMI = MRI.getVRegDef(CondReg);
if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
CondReg = CCMI->getOperand(1).getReg();
CCMI = MRI.getVRegDef(CondReg);
}
// Try to select the G_BRCOND using whatever is feeding the condition if
// possible.
MachineIRBuilder MIB(I);
unsigned CCMIOpc = CCMI->getOpcode();
if (CCMIOpc == TargetOpcode::G_FCMP)
return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
if (CCMIOpc == TargetOpcode::G_ICMP)
return selectCompareBranchFedByICmp(I, *CCMI, MIB);
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
// instructions will not be produced, as they are conditional branch
// instructions that do not set flags.
if (ProduceNonFlagSettingCondBr) {
emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
I.getOperand(1).getMBB(), MIB);
I.eraseFromParent();
return true;
}
// Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
auto TstMI =
MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
auto Bcc = MIB.buildInstr(AArch64::Bcc)
.addImm(AArch64CC::EQ)
.addMBB(I.getOperand(1).getMBB());
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
}
/// Returns the element immediate value of a vector shift operand if found.
/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
static Optional<int64_t> getVectorShiftImm(Register Reg,
MachineRegisterInfo &MRI) {
assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
MachineInstr *OpMI = MRI.getVRegDef(Reg);
assert(OpMI && "Expected to find a vreg def for vector shift operand");
if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
return None;
// Check all operands are identical immediates.
int64_t ImmVal = 0;
for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) {
auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI);
if (!VRegAndVal)
return None;
if (Idx == 1)
ImmVal = VRegAndVal->Value.getSExtValue();
if (ImmVal != VRegAndVal->Value.getSExtValue())
return None;
}
return ImmVal;
}
/// Matches and returns the shift immediate value for a SHL instruction given
/// a shift operand.
static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
if (!ShiftImm)
return None;
// Check the immediate is in range for a SHL.
int64_t Imm = *ShiftImm;
if (Imm < 0)
return None;
switch (SrcTy.getElementType().getSizeInBits()) {
default:
LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
return None;
case 8:
if (Imm > 7)
return None;
break;
case 16:
if (Imm > 15)
return None;
break;
case 32:
if (Imm > 31)
return None;
break;
case 64:
if (Imm > 63)
return None;
break;
}
return Imm;
}
bool AArch64InstructionSelector::selectVectorSHL(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_SHL);
Register DstReg = I.getOperand(0).getReg();
const LLT Ty = MRI.getType(DstReg);
Register Src1Reg = I.getOperand(1).getReg();
Register Src2Reg = I.getOperand(2).getReg();
if (!Ty.isVector())
return false;
// Check if we have a vector of constants on RHS that we can select as the
// immediate form.
Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
unsigned Opc = 0;
if (Ty == LLT::vector(2, 64)) {
Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
} else if (Ty == LLT::vector(4, 32)) {
Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
} else if (Ty == LLT::vector(2, 32)) {
Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
} else if (Ty == LLT::vector(4, 16)) {
Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
} else if (Ty == LLT::vector(8, 16)) {
Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
} else if (Ty == LLT::vector(16, 8)) {
Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
} else if (Ty == LLT::vector(8, 8)) {
Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
return false;
}
MachineIRBuilder MIB(I);
auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
if (ImmVal)
Shl.addImm(*ImmVal);
else
Shl.addUse(Src2Reg);
constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectVectorAshrLshr(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_ASHR ||
I.getOpcode() == TargetOpcode::G_LSHR);
Register DstReg = I.getOperand(0).getReg();
const LLT Ty = MRI.getType(DstReg);
Register Src1Reg = I.getOperand(1).getReg();
Register Src2Reg = I.getOperand(2).getReg();
if (!Ty.isVector())
return false;
bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
// We expect the immediate case to be lowered in the PostLegalCombiner to
// AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
// There is not a shift right register instruction, but the shift left
// register instruction takes a signed value, where negative numbers specify a
// right shift.
unsigned Opc = 0;
unsigned NegOpc = 0;
const TargetRegisterClass *RC =
getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
if (Ty == LLT::vector(2, 64)) {
Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
NegOpc = AArch64::NEGv2i64;
} else if (Ty == LLT::vector(4, 32)) {
Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
NegOpc = AArch64::NEGv4i32;
} else if (Ty == LLT::vector(2, 32)) {
Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
NegOpc = AArch64::NEGv2i32;
} else if (Ty == LLT::vector(4, 16)) {
Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
NegOpc = AArch64::NEGv4i16;
} else if (Ty == LLT::vector(8, 16)) {
Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
NegOpc = AArch64::NEGv8i16;
} else if (Ty == LLT::vector(16, 8)) {
Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
NegOpc = AArch64::NEGv8i16;
} else if (Ty == LLT::vector(8, 8)) {
Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
NegOpc = AArch64::NEGv8i8;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
return false;
}
MachineIRBuilder MIB(I);
auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectVaStartAAPCS(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
return false;
}
bool AArch64InstructionSelector::selectVaStartDarwin(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
Register ListReg = I.getOperand(0).getReg();
Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
auto MIB =
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
.addDef(ArgsAddrReg)
.addFrameIndex(FuncInfo->getVarArgsStackIndex())
.addImm(0)
.addImm(0);
constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
.addUse(ArgsAddrReg)
.addUse(ListReg)
.addImm(0)
.addMemOperand(*I.memoperands_begin());
constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
void AArch64InstructionSelector::materializeLargeCMVal(
MachineInstr &I, const Value *V, unsigned OpFlags) const {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineIRBuilder MIB(I);
[GISel]: Refactor MachineIRBuilder to allow passing additional parameters to build Instrs https://reviews.llvm.org/D55294 Previously MachineIRBuilder::buildInstr used to accept variadic arguments for sources (which were either unsigned or MachineInstrBuilder). While this worked well in common cases, it doesn't allow us to build instructions that have multiple destinations. Additionally passing in other optional parameters in the end (such as flags) is not possible trivially. Also a trivial call such as B.buildInstr(Opc, Reg1, Reg2, Reg3) can be interpreted differently based on the opcode (2defs + 1 src for unmerge vs 1 def + 2srcs). This patch refactors the buildInstr to buildInstr(Opc, ArrayRef<DstOps>, ArrayRef<SrcOps>) where DstOps and SrcOps are typed unions that know how to add itself to MachineInstrBuilder. After this patch, most invocations would look like B.buildInstr(Opc, {s32, DstReg}, {SrcRegs..., SrcMIBs..}); Now all the other calls (such as buildAdd, buildSub etc) forward to buildInstr. It also makes it possible to build instructions with multiple defs. Additionally in a subsequent patch, we should make it possible to add flags directly while building instructions. Additionally, the main buildInstr method is now virtual and other builders now only have to override buildInstr (for say constant folding/cseing) is straightforward. Also attached here (https://reviews.llvm.org/F7675680) is a clang-tidy patch that should upgrade the API calls if necessary. llvm-svn: 348815
2018-12-11 08:48:50 +08:00
auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
MovZ->addOperand(MF, I.getOperand(1));
MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
AArch64II::MO_NC);
MovZ->addOperand(MF, MachineOperand::CreateImm(0));
constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
Register ForceDstReg) {
Register DstReg = ForceDstReg
? ForceDstReg
: MRI.createVirtualRegister(&AArch64::GPR64RegClass);
auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
if (auto *GV = dyn_cast<GlobalValue>(V)) {
MovI->addOperand(MF, MachineOperand::CreateGA(
GV, MovZ->getOperand(1).getOffset(), Flags));
} else {
MovI->addOperand(
MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
MovZ->getOperand(1).getOffset(), Flags));
}
MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
return DstReg;
};
Register DstReg = BuildMovK(MovZ.getReg(0),
AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
return;
}
bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR: {
// These shifts are legalized to have 64 bit shift amounts because we want
// to take advantage of the existing imported selection patterns that assume
// the immediates are s64s. However, if the shifted type is 32 bits and for
// some reason we receive input GMIR that has an s64 shift amount that's not
// a G_CONSTANT, insert a truncate so that we can still select the s32
// register-register variant.
Register SrcReg = I.getOperand(1).getReg();
Register ShiftReg = I.getOperand(2).getReg();
const LLT ShiftTy = MRI.getType(ShiftReg);
const LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.isVector())
return false;
assert(!ShiftTy.isVector() && "unexpected vector shift ty");
if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
return false;
auto *AmtMI = MRI.getVRegDef(ShiftReg);
assert(AmtMI && "could not find a vreg definition for shift amount");
if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
// Insert a subregister copy to implement a 64->32 trunc
MachineIRBuilder MIB(I);
auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
.addReg(ShiftReg, 0, AArch64::sub_32);
MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
I.getOperand(2).setReg(Trunc.getReg(0));
}
return true;
}
case TargetOpcode::G_STORE:
return contractCrossBankCopyIntoStore(I, MRI);
case TargetOpcode::G_PTR_ADD:
return convertPtrAddToAdd(I, MRI);
case TargetOpcode::G_LOAD: {
// For scalar loads of pointers, we try to convert the dest type from p0
// to s64 so that our imported patterns can match. Like with the G_PTR_ADD
// conversion, this should be ok because all users should have been
// selected already, so the type doesn't matter for them.
Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
if (!DstTy.isPointer())
return false;
MRI.setType(DstReg, LLT::scalar(64));
return true;
}
case AArch64::G_DUP: {
// Convert the type from p0 to s64 to help selection.
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
if (!DstTy.getElementType().isPointer())
return false;
MachineIRBuilder MIB(I);
auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
MRI.setType(I.getOperand(0).getReg(),
DstTy.changeElementType(LLT::scalar(64)));
MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
I.getOperand(1).setReg(NewSrc.getReg(0));
return true;
}
default:
return false;
}
}
/// This lowering tries to look for G_PTR_ADD instructions and then converts
[AArch64][GlobalISel] Don't reconvert to p0 in convertPtrAddToAdd(). convertPtrAddToAdd improved overall code size and quality by a significant amount, but on -O0 we generate some cross-class copies due to the fact that we emitted G_PTRTOINT and G_INTTOPTR around the G_ADD. Unfortunately at -O0 we don't run any register coalescing, so these cross class copies end up escaping as moves, and we ended up regressing 3 benchmarks on CTMark (though still a winner overall). This patch changes the lowering to instead directly emit the G_ADD into the destination register, and then force changes the dest LLT to s64 from p0. This should be ok, as all uses of the register should now be selected and therefore the LLT doesn't matter for the users. It does however matter for the importer patterns, which will fail to select a G_ADD if there's a p0 LLT. I'm not able to get rid of the G_PTRTOINT on the source yet however. We can't use the same trick of breaking the type system since that could break the selection of the defining instruction. Thus with -O0 we still end up with a cross class copy on source. Code size improvements on -O0: Program baseline new diff test-suite :: CTMark/Bullet/bullet.test 965520 949164 -1.7% test-suite...TMark/7zip/7zip-benchmark.test 1069456 1052600 -1.6% test-suite...ark/tramp3d-v4/tramp3d-v4.test 1213692 1199804 -1.1% test-suite...:: CTMark/sqlite3/sqlite3.test 421680 419736 -0.5% test-suite...-typeset/consumer-typeset.test 837076 833380 -0.4% test-suite :: CTMark/lencod/lencod.test 799712 796976 -0.3% test-suite...:: CTMark/ClamAV/clamscan.test 688264 686132 -0.3% test-suite :: CTMark/kimwitu++/kc.test 1002344 999648 -0.3% test-suite...Mark/mafft/pairlocalalign.test 422296 421768 -0.1% test-suite :: CTMark/SPASS/SPASS.test 656792 656532 -0.0% Geomean difference -0.6% Differential Revision: https://reviews.llvm.org/D73910
2020-02-04 02:32:25 +08:00
/// them to a standard G_ADD with a COPY on the source.
///
/// The motivation behind this is to expose the add semantics to the imported
/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
/// because the selector works bottom up, uses before defs. By the time we
/// end up trying to select a G_PTR_ADD, we should have already attempted to
/// fold this into addressing modes and were therefore unsuccessful.
bool AArch64InstructionSelector::convertPtrAddToAdd(
MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
Register DstReg = I.getOperand(0).getReg();
Register AddOp1Reg = I.getOperand(1).getReg();
const LLT PtrTy = MRI.getType(DstReg);
if (PtrTy.getAddressSpace() != 0)
return false;
MachineIRBuilder MIB(I);
const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
// Set regbanks on the registers.
if (PtrTy.isVector())
MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
else
MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
[AArch64][GlobalISel] Don't reconvert to p0 in convertPtrAddToAdd(). convertPtrAddToAdd improved overall code size and quality by a significant amount, but on -O0 we generate some cross-class copies due to the fact that we emitted G_PTRTOINT and G_INTTOPTR around the G_ADD. Unfortunately at -O0 we don't run any register coalescing, so these cross class copies end up escaping as moves, and we ended up regressing 3 benchmarks on CTMark (though still a winner overall). This patch changes the lowering to instead directly emit the G_ADD into the destination register, and then force changes the dest LLT to s64 from p0. This should be ok, as all uses of the register should now be selected and therefore the LLT doesn't matter for the users. It does however matter for the importer patterns, which will fail to select a G_ADD if there's a p0 LLT. I'm not able to get rid of the G_PTRTOINT on the source yet however. We can't use the same trick of breaking the type system since that could break the selection of the defining instruction. Thus with -O0 we still end up with a cross class copy on source. Code size improvements on -O0: Program baseline new diff test-suite :: CTMark/Bullet/bullet.test 965520 949164 -1.7% test-suite...TMark/7zip/7zip-benchmark.test 1069456 1052600 -1.6% test-suite...ark/tramp3d-v4/tramp3d-v4.test 1213692 1199804 -1.1% test-suite...:: CTMark/sqlite3/sqlite3.test 421680 419736 -0.5% test-suite...-typeset/consumer-typeset.test 837076 833380 -0.4% test-suite :: CTMark/lencod/lencod.test 799712 796976 -0.3% test-suite...:: CTMark/ClamAV/clamscan.test 688264 686132 -0.3% test-suite :: CTMark/kimwitu++/kc.test 1002344 999648 -0.3% test-suite...Mark/mafft/pairlocalalign.test 422296 421768 -0.1% test-suite :: CTMark/SPASS/SPASS.test 656792 656532 -0.0% Geomean difference -0.6% Differential Revision: https://reviews.llvm.org/D73910
2020-02-04 02:32:25 +08:00
// Now turn the %dst(p0) = G_PTR_ADD %base, off into:
// %dst(intty) = G_ADD %intbase, off
[AArch64][GlobalISel] Don't reconvert to p0 in convertPtrAddToAdd(). convertPtrAddToAdd improved overall code size and quality by a significant amount, but on -O0 we generate some cross-class copies due to the fact that we emitted G_PTRTOINT and G_INTTOPTR around the G_ADD. Unfortunately at -O0 we don't run any register coalescing, so these cross class copies end up escaping as moves, and we ended up regressing 3 benchmarks on CTMark (though still a winner overall). This patch changes the lowering to instead directly emit the G_ADD into the destination register, and then force changes the dest LLT to s64 from p0. This should be ok, as all uses of the register should now be selected and therefore the LLT doesn't matter for the users. It does however matter for the importer patterns, which will fail to select a G_ADD if there's a p0 LLT. I'm not able to get rid of the G_PTRTOINT on the source yet however. We can't use the same trick of breaking the type system since that could break the selection of the defining instruction. Thus with -O0 we still end up with a cross class copy on source. Code size improvements on -O0: Program baseline new diff test-suite :: CTMark/Bullet/bullet.test 965520 949164 -1.7% test-suite...TMark/7zip/7zip-benchmark.test 1069456 1052600 -1.6% test-suite...ark/tramp3d-v4/tramp3d-v4.test 1213692 1199804 -1.1% test-suite...:: CTMark/sqlite3/sqlite3.test 421680 419736 -0.5% test-suite...-typeset/consumer-typeset.test 837076 833380 -0.4% test-suite :: CTMark/lencod/lencod.test 799712 796976 -0.3% test-suite...:: CTMark/ClamAV/clamscan.test 688264 686132 -0.3% test-suite :: CTMark/kimwitu++/kc.test 1002344 999648 -0.3% test-suite...Mark/mafft/pairlocalalign.test 422296 421768 -0.1% test-suite :: CTMark/SPASS/SPASS.test 656792 656532 -0.0% Geomean difference -0.6% Differential Revision: https://reviews.llvm.org/D73910
2020-02-04 02:32:25 +08:00
I.setDesc(TII.get(TargetOpcode::G_ADD));
MRI.setType(DstReg, CastPtrTy);
[AArch64][GlobalISel] Don't reconvert to p0 in convertPtrAddToAdd(). convertPtrAddToAdd improved overall code size and quality by a significant amount, but on -O0 we generate some cross-class copies due to the fact that we emitted G_PTRTOINT and G_INTTOPTR around the G_ADD. Unfortunately at -O0 we don't run any register coalescing, so these cross class copies end up escaping as moves, and we ended up regressing 3 benchmarks on CTMark (though still a winner overall). This patch changes the lowering to instead directly emit the G_ADD into the destination register, and then force changes the dest LLT to s64 from p0. This should be ok, as all uses of the register should now be selected and therefore the LLT doesn't matter for the users. It does however matter for the importer patterns, which will fail to select a G_ADD if there's a p0 LLT. I'm not able to get rid of the G_PTRTOINT on the source yet however. We can't use the same trick of breaking the type system since that could break the selection of the defining instruction. Thus with -O0 we still end up with a cross class copy on source. Code size improvements on -O0: Program baseline new diff test-suite :: CTMark/Bullet/bullet.test 965520 949164 -1.7% test-suite...TMark/7zip/7zip-benchmark.test 1069456 1052600 -1.6% test-suite...ark/tramp3d-v4/tramp3d-v4.test 1213692 1199804 -1.1% test-suite...:: CTMark/sqlite3/sqlite3.test 421680 419736 -0.5% test-suite...-typeset/consumer-typeset.test 837076 833380 -0.4% test-suite :: CTMark/lencod/lencod.test 799712 796976 -0.3% test-suite...:: CTMark/ClamAV/clamscan.test 688264 686132 -0.3% test-suite :: CTMark/kimwitu++/kc.test 1002344 999648 -0.3% test-suite...Mark/mafft/pairlocalalign.test 422296 421768 -0.1% test-suite :: CTMark/SPASS/SPASS.test 656792 656532 -0.0% Geomean difference -0.6% Differential Revision: https://reviews.llvm.org/D73910
2020-02-04 02:32:25 +08:00
I.getOperand(1).setReg(PtrToInt.getReg(0));
if (!select(*PtrToInt)) {
LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
return false;
}
// Also take the opportunity here to try to do some optimization.
// Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
Register NegatedReg;
if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
return true;
I.getOperand(2).setReg(NegatedReg);
I.setDesc(TII.get(TargetOpcode::G_SUB));
return true;
}
bool AArch64InstructionSelector::earlySelectSHL(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// We try to match the immediate variant of LSL, which is actually an alias
// for a special case of UBFM. Otherwise, we fall back to the imported
// selector which will match the register variant.
assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
const auto &MO = I.getOperand(2);
auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
if (!VRegAndVal)
return false;
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
if (DstTy.isVector())
return false;
bool Is64Bit = DstTy.getSizeInBits() == 64;
auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
MachineIRBuilder MIB(I);
if (!Imm1Fn || !Imm2Fn)
return false;
auto NewI =
MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
{I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
for (auto &RenderFn : *Imm1Fn)
RenderFn(NewI);
for (auto &RenderFn : *Imm2Fn)
RenderFn(NewI);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
// If we're storing a scalar, it doesn't matter what register bank that
// scalar is on. All that matters is the size.
//
// So, if we see something like this (with a 32-bit scalar as an example):
//
// %x:gpr(s32) = ... something ...
// %y:fpr(s32) = COPY %x:gpr(s32)
// G_STORE %y:fpr(s32)
//
// We can fix this up into something like this:
//
// G_STORE %x:gpr(s32)
//
// And then continue the selection process normally.
Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
if (!DefDstReg.isValid())
return false;
LLT DefDstTy = MRI.getType(DefDstReg);
Register StoreSrcReg = I.getOperand(0).getReg();
LLT StoreSrcTy = MRI.getType(StoreSrcReg);
// If we get something strange like a physical register, then we shouldn't
// go any further.
if (!DefDstTy.isValid())
return false;
// Are the source and dst types the same size?
if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
return false;
if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
RBI.getRegBank(DefDstReg, MRI, TRI))
return false;
// We have a cross-bank copy, which is entering a store. Let's fold it.
I.getOperand(0).setReg(DefDstReg);
return true;
}
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
switch (I.getOpcode()) {
case TargetOpcode::G_BR: {
// If the branch jumps to the fallthrough block, don't bother emitting it.
// Only do this for -O0 for a good code size improvement, because when
// optimizations are enabled we want to leave this choice to
// MachineBlockPlacement.
bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
return false;
I.eraseFromParent();
return true;
}
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
case TargetOpcode::G_CONSTANT: {
bool IsZero = false;
if (I.getOperand(1).isCImm())
IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
else if (I.getOperand(1).isImm())
IsZero = I.getOperand(1).getImm() == 0;
if (!IsZero)
return false;
Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
if (Ty.getSizeInBits() == 64) {
I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
} else if (Ty.getSizeInBits() == 32) {
I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
} else
return false;
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
}
default:
return false;
}
}
bool AArch64InstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
const AArch64Subtarget *Subtarget =
&static_cast<const AArch64Subtarget &>(MF.getSubtarget());
if (Subtarget->requiresStrictAlign()) {
// We don't support this feature yet.
LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
return false;
}
unsigned Opcode = I.getOpcode();
// G_PHI requires same handling as PHI
if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
// Certain non-generic instructions also need some special handling.
if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
const RegClassOrRegBank &RegClassOrBank =
MRI.getRegClassOrRegBank(DefReg);
const TargetRegisterClass *DefRC
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
if (!DefRC) {
if (!DefTy.isValid()) {
LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
return false;
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
}
}
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}
if (I.isCopy())
return selectCopy(I, TII, MRI, TRI, RBI);
return true;
}
if (I.getNumOperands() != I.getNumExplicitOperands()) {
LLVM_DEBUG(
dbgs() << "Generic instruction has unexpected implicit operands\n");
return false;
}
// Try to do some lowering before we start instruction selecting. These
// lowerings are purely transformations on the input G_MIR and so selection
// must continue after any modification of the instruction.
if (preISelLower(I)) {
Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
}
// There may be patterns where the importer can't deal with them optimally,
// but does select it to a suboptimal sequence so our custom C++ selection
// code later never has a chance to work on it. Therefore, we have an early
// selection attempt here to give priority to certain selection routines
// over the imported ones.
if (earlySelect(I))
return true;
if (selectImpl(I, *CoverageInfo))
return true;
LLT Ty =
I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
MachineIRBuilder MIB(I);
switch (Opcode) {
case TargetOpcode::G_BRCOND:
return selectCompareBranch(I, MF, MRI);
case TargetOpcode::G_BRINDIRECT: {
I.setDesc(TII.get(AArch64::BR));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_BRJT:
return selectBrJT(I, MRI);
case AArch64::G_ADD_LOW: {
// This op may have been separated from it's ADRP companion by the localizer
// or some other code motion pass. Given that many CPUs will try to
// macro fuse these operations anyway, select this into a MOVaddr pseudo
// which will later be expanded into an ADRP+ADD pair after scheduling.
MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
if (BaseMI->getOpcode() != AArch64::ADRP) {
I.setDesc(TII.get(AArch64::ADDXri));
I.addOperand(MachineOperand::CreateImm(0));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
assert(TM.getCodeModel() == CodeModel::Small &&
"Expected small code model");
MachineIRBuilder MIB(I);
auto Op1 = BaseMI->getOperand(1);
auto Op2 = I.getOperand(2);
auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
.addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
Op1.getTargetFlags())
.addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
Op2.getTargetFlags());
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
}
case TargetOpcode::G_BSWAP: {
// Handle vector types for G_BSWAP directly.
Register DstReg = I.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// We should only get vector types here; everything else is handled by the
// importer right now.
if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
return false;
}
// Only handle 4 and 2 element vectors for now.
// TODO: 16-bit elements.
unsigned NumElts = DstTy.getNumElements();
if (NumElts != 4 && NumElts != 2) {
LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
return false;
}
// Choose the correct opcode for the supported types. Right now, that's
// v2s32, v4s32, and v2s64.
unsigned Opc = 0;
unsigned EltSize = DstTy.getElementType().getSizeInBits();
if (EltSize == 32)
Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
: AArch64::REV32v16i8;
else if (EltSize == 64)
Opc = AArch64::REV64v16i8;
// We should always get something by the time we get here...
assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT p0 = LLT::pointer(0, 64);
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
const unsigned DefSize = DefTy.getSizeInBits();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
if (Ty != s32 && Ty != s64) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
<< " constant, expected: " << s32 << " or " << s64
<< '\n');
return false;
}
if (RB.getID() != AArch64::FPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
<< " constant on bank: " << RB
<< ", expected: FPR\n");
return false;
}
// The case when we have 0.0 is covered by tablegen. Reject it here so we
// can be sure tablegen works correctly and isn't rescued by this code.
if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0))
return false;
} else {
// s32 and s64 are covered by tablegen.
if (Ty != p0 && Ty != s8 && Ty != s16) {
LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
<< " constant, expected: " << s32 << ", " << s64
<< ", or " << p0 << '\n');
return false;
}
if (RB.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
<< " constant on bank: " << RB
<< ", expected: GPR\n");
return false;
}
}
// We allow G_CONSTANT of types < 32b.
const unsigned MovOpc =
DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
if (isFP) {
// Either emit a FMOV, or emit a copy to emit a normal mov.
const TargetRegisterClass &GPRRC =
DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
const TargetRegisterClass &FPRRC =
DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
// Can we use a FMOV instruction to represent the immediate?
if (emitFMovForFConstant(I, MRI))
return true;
// For 64b values, emit a constant pool load instead.
if (DefSize == 64) {
auto *FPImm = I.getOperand(1).getFPImm();
MachineIRBuilder MIB(I);
auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
if (!LoadMI) {
LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
return false;
}
MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
I.eraseFromParent();
return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
}
// Nope. Emit a copy and use a normal mov instead.
const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
MachineOperand &RegOp = I.getOperand(0);
RegOp.setReg(DefGPRReg);
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildCopy({DefReg}, {DefGPRReg});
if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
return false;
}
MachineOperand &ImmOp = I.getOperand(1);
// FIXME: Is going through int64_t always correct?
ImmOp.ChangeToImmediate(
ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
[globalisel] Decouple src pattern operands from dst pattern operands. Summary: This isn't testable for AArch64 by itself so this patch also adds support for constant immediates in the pattern and physical register uses in the result. The new IntOperandMatcher matches the constant in patterns such as '(set $rd:GPR32, (G_XOR $rs:GPR32, -1))'. It's always safe to fold immediates into an instruction so this is the first rule that will match across multiple BB's. The Renderer hierarchy is responsible for adding operands to the result instruction. Renderers can copy operands (CopyRenderer) or add physical registers (in particular %wzr and %xzr) to the result instruction in any order (OperandMatchers now import the operand names from SelectionDAG to allow renderers to access any operand). This allows us to emit the result instruction for: %1 = G_XOR %0, -1 --> %1 = ORNWrr %wzr, %0 %1 = G_XOR -1, %0 --> %1 = ORNWrr %wzr, %0 although the latter is untested since the matcher/importer has not been taught about commutativity yet. Added BuildMIAction which can build new instructions and mutate them where possible. W.r.t the mutation aspect, MatchActions are now told the name of an instruction they can recycle and BuildMIAction will emit mutation code when the renderers are appropriate. They are appropriate when all operands are rendered using CopyRenderer and the indices are the same as the matcher. This currently assumes that all operands have at least one matcher. Finally, this change also fixes a crash in AArch64InstructionSelector::select() caused by an immediate operand passing isImm() rather than isCImm(). This was uncovered by the other changes and was detected by existing tests. Depends on D29711 Reviewers: t.p.northover, ab, qcolombet, rovka, aditya_nandakumar, javed.absar Reviewed By: rovka Subscribers: aemerson, dberris, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D29712 llvm-svn: 296131
2017-02-24 23:43:30 +08:00
} else if (I.getOperand(1).isCImm()) {
uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
I.getOperand(1).ChangeToImmediate(Val);
[globalisel] Decouple src pattern operands from dst pattern operands. Summary: This isn't testable for AArch64 by itself so this patch also adds support for constant immediates in the pattern and physical register uses in the result. The new IntOperandMatcher matches the constant in patterns such as '(set $rd:GPR32, (G_XOR $rs:GPR32, -1))'. It's always safe to fold immediates into an instruction so this is the first rule that will match across multiple BB's. The Renderer hierarchy is responsible for adding operands to the result instruction. Renderers can copy operands (CopyRenderer) or add physical registers (in particular %wzr and %xzr) to the result instruction in any order (OperandMatchers now import the operand names from SelectionDAG to allow renderers to access any operand). This allows us to emit the result instruction for: %1 = G_XOR %0, -1 --> %1 = ORNWrr %wzr, %0 %1 = G_XOR -1, %0 --> %1 = ORNWrr %wzr, %0 although the latter is untested since the matcher/importer has not been taught about commutativity yet. Added BuildMIAction which can build new instructions and mutate them where possible. W.r.t the mutation aspect, MatchActions are now told the name of an instruction they can recycle and BuildMIAction will emit mutation code when the renderers are appropriate. They are appropriate when all operands are rendered using CopyRenderer and the indices are the same as the matcher. This currently assumes that all operands have at least one matcher. Finally, this change also fixes a crash in AArch64InstructionSelector::select() caused by an immediate operand passing isImm() rather than isCImm(). This was uncovered by the other changes and was detected by existing tests. Depends on D29711 Reviewers: t.p.northover, ab, qcolombet, rovka, aditya_nandakumar, javed.absar Reviewed By: rovka Subscribers: aemerson, dberris, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D29712 llvm-svn: 296131
2017-02-24 23:43:30 +08:00
} else if (I.getOperand(1).isImm()) {
uint64_t Val = I.getOperand(1).getImm();
I.getOperand(1).ChangeToImmediate(Val);
}
I.setDesc(TII.get(MovOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
case TargetOpcode::G_EXTRACT: {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
LLT DstTy = MRI.getType(DstReg);
(void)DstTy;
unsigned SrcSize = SrcTy.getSizeInBits();
if (SrcTy.getSizeInBits() > 64) {
// This should be an extract of an s128, which is like a vector extract.
if (SrcTy.getSizeInBits() != 128)
return false;
// Only support extracting 64 bits from an s128 at the moment.
if (DstTy.getSizeInBits() != 64)
return false;
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
// Check we have the right regbank always.
assert(SrcRB.getID() == AArch64::FPRRegBankID &&
DstRB.getID() == AArch64::FPRRegBankID &&
"Wrong extract regbank!");
(void)SrcRB;
// Emit the same code as a vector extract.
// Offset must be a multiple of 64.
unsigned Offset = I.getOperand(2).getImm();
if (Offset % 64 != 0)
return false;
unsigned LaneIdx = Offset / 64;
MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
if (!Extract)
return false;
I.eraseFromParent();
return true;
}
I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
Ty.getSizeInBits() - 1);
if (SrcSize < 64) {
assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
"unexpected G_EXTRACT types");
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(DstReg, 0, AArch64::sub_32);
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AArch64::GPR32RegClass, MRI);
I.getOperand(0).setReg(DstReg);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_INSERT: {
LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
unsigned DstSize = DstTy.getSizeInBits();
// Larger inserts are vectors, same-size ones should be something else by
// now (split up or turned into COPYs).
if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
return false;
I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
unsigned LSB = I.getOperand(3).getImm();
unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
I.getOperand(3).setImm((DstSize - LSB) % DstSize);
MachineInstrBuilder(MF, I).addImm(Width - 1);
if (DstSize < 64) {
assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
"unexpected G_INSERT types");
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG))
.addDef(SrcReg)
.addImm(0)
.addUse(I.getOperand(2).getReg())
.addImm(AArch64::sub_32);
RBI.constrainGenericRegister(I.getOperand(2).getReg(),
AArch64::GPR32RegClass, MRI);
I.getOperand(2).setReg(SrcReg);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_FRAME_INDEX: {
// allocas and G_FRAME_INDEX are only supported in addrspace(0).
if (Ty != LLT::pointer(0, 64)) {
LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
<< ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
I.setDesc(TII.get(AArch64::ADDXri));
// MOs for a #0 shifted immediate.
I.addOperand(MachineOperand::CreateImm(0));
I.addOperand(MachineOperand::CreateImm(0));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_GLOBAL_VALUE: {
auto GV = I.getOperand(1).getGlobal();
if (GV->isThreadLocal())
return selectTLSGlobalValue(I, MRI);
unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
} else if (TM.getCodeModel() == CodeModel::Large) {
// Materialize the global using movz/movk instructions.
materializeLargeCMVal(I, GV, OpFlags);
I.eraseFromParent();
return true;
} else if (TM.getCodeModel() == CodeModel::Tiny) {
I.setDesc(TII.get(AArch64::ADR));
I.getOperand(1).setTargetFlags(OpFlags);
} else {
I.setDesc(TII.get(AArch64::MOVaddr));
I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
MachineInstrBuilder MIB(MF, I);
MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
MachineIRBuilder MIB(I);
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
<< ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
auto &MemOp = **I.memoperands_begin();
uint64_t MemSizeInBytes = MemOp.getSize();
if (MemOp.isAtomic()) {
// For now we just support s8 acquire loads to be able to compile stack
// protector code.
if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
MemSizeInBytes == 1) {
I.setDesc(TII.get(AArch64::LDARB));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
return false;
}
unsigned MemSizeInBits = MemSizeInBytes * 8;
#ifndef NDEBUG
const Register PtrReg = I.getOperand(1).getReg();
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
// Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
"Load/Store pointer operand isn't a pointer");
#endif
const Register ValReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
// Helper lambda for partially selecting I. Either returns the original
// instruction with an updated opcode, or a new instruction.
auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
const unsigned NewOpc =
selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
if (NewOpc == I.getOpcode())
return nullptr;
// Check if we can fold anything into the addressing mode.
auto AddrModeFns =
selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
if (!AddrModeFns) {
// Can't fold anything. Use the original instruction.
I.setDesc(TII.get(NewOpc));
I.addOperand(MachineOperand::CreateImm(0));
return &I;
}
// Folded something. Create a new instruction and return it.
auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
NewInst.cloneMemRefs(I);
for (auto &Fn : *AddrModeFns)
Fn(NewInst);
I.eraseFromParent();
return &*NewInst;
};
MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
if (!LoadStore)
return false;
// If we're storing a 0, use WZR/XZR.
if (Opcode == TargetOpcode::G_STORE) {
auto CVal = getConstantVRegValWithLookThrough(
LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
/*HandleFConstants = */ false);
if (CVal && CVal->Value == 0) {
switch (LoadStore->getOpcode()) {
case AArch64::STRWui:
case AArch64::STRHHui:
case AArch64::STRBBui:
LoadStore->getOperand(0).setReg(AArch64::WZR);
break;
case AArch64::STRXui:
LoadStore->getOperand(0).setReg(AArch64::XZR);
break;
}
}
}
if (IsZExtLoad) {
// The zextload from a smaller type to i32 should be handled by the
// importer.
if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
return false;
// If we have a ZEXTLOAD then change the load's type to be a narrower reg
// and zero_extend with SUBREG_TO_REG.
Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
Register DstReg = LoadStore->getOperand(0).getReg();
LoadStore->getOperand(0).setReg(LdReg);
MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
.addImm(0)
.addUse(LdReg)
.addImm(AArch64::sub_32);
constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
MRI);
}
return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
}
case TargetOpcode::G_SMULH:
case TargetOpcode::G_UMULH: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
if (Ty != LLT::scalar(64)) {
LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
<< ", expected: " << LLT::scalar(64) << '\n');
return false;
}
unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
: AArch64::UMULHrr;
I.setDesc(TII.get(NewOpc));
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
if (MRI.getType(I.getOperand(0).getReg()).isVector())
return selectVectorAshrLshr(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_SHL:
if (Opcode == TargetOpcode::G_SHL &&
MRI.getType(I.getOperand(0).getReg()).isVector())
return selectVectorSHL(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_OR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
const unsigned OpSize = Ty.getSizeInBits();
const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
if (NewOpc == I.getOpcode())
return false;
I.setDesc(TII.get(NewOpc));
// FIXME: Should the type be always reset in setDesc?
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_PTR_ADD: {
MachineIRBuilder MIRBuilder(I);
emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
MIRBuilder);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_SADDO:
case TargetOpcode::G_UADDO:
case TargetOpcode::G_SSUBO: {
// Emit the operation and get the correct condition code.
MachineIRBuilder MIRBuilder(I);
auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
I.getOperand(2), I.getOperand(3), MIRBuilder);
// Now, put the overflow result in the register given by the first operand
// to the overflow op. CSINC increments the result when the predicate is
// false, so to get the increment when it's true, we need to use the
// inverse. In this case, we want to increment when carry is set.
Register ZReg = AArch64::WZR;
auto CsetMI = MIRBuilder
.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
{ZReg, ZReg})
.addImm(getInvertedCondCode(OpAndCC.second));
constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_PTRMASK: {
Register MaskReg = I.getOperand(2).getReg();
Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
// TODO: Implement arbitrary cases
if (!MaskVal || !isShiftedMask_64(*MaskVal))
return false;
uint64_t Mask = *MaskVal;
I.setDesc(TII.get(AArch64::ANDXri));
I.getOperand(2).ChangeToImmediate(
AArch64_AM::encodeLogicalImmediate(Mask, 64));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
LLVM_DEBUG(
dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
if (DstRB.getID() == AArch64::GPRRegBankID) {
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(DstTy, DstRB, RBI);
if (!DstRC)
return false;
const TargetRegisterClass *SrcRC =
getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
if (!SrcRC)
return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
if (DstRC == SrcRC) {
// Nothing to be done
} else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
SrcTy == LLT::scalar(64)) {
llvm_unreachable("TableGen can import this case");
return false;
} else if (DstRC == &AArch64::GPR32RegClass &&
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
LLVM_DEBUG(
dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
} else if (DstRB.getID() == AArch64::FPRRegBankID) {
if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
I.setDesc(TII.get(AArch64::XTNv4i16));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
if (!Extract)
return false;
I.eraseFromParent();
return true;
}
// We might have a vector G_PTRTOINT, in which case just emit a COPY.
if (Opcode == TargetOpcode::G_PTRTOINT) {
assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
}
}
return false;
}
case TargetOpcode::G_ANYEXT: {
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
if (RBDst.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
<< ", expected: GPR\n");
return false;
}
const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
if (RBSrc.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
<< ", expected: GPR\n");
return false;
}
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
if (DstSize == 0) {
LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
return false;
}
if (DstSize != 64 && DstSize > 32) {
LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
<< ", expected: 32 or 64\n");
return false;
}
// At this point G_ANYEXT is just like a plain COPY, but we need
// to explicitly form the 64-bit value if any.
if (DstSize > 32) {
Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
.addDef(ExtSrc)
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
I.getOperand(1).setReg(ExtSrc);
}
return selectCopy(I, TII, MRI, TRI, RBI);
}
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT_INREG:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
const Register DefReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DefReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
// SEXT_INREG has the same src reg size as dst, the size of the value to be
// extended is encoded in the imm.
if (Opcode == TargetOpcode::G_SEXT_INREG)
SrcSize = I.getOperand(2).getImm();
if (DstTy.isVector())
return false; // Should be handled by imported patterns.
assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
AArch64::GPRRegBankID &&
"Unexpected ext regbank");
MachineIRBuilder MIB(I);
MachineInstr *ExtI;
// First check if we're extending the result of a load which has a dest type
// smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
// GPR register on AArch64 and all loads which are smaller automatically
// zero-extend the upper bits. E.g.
// %v(s8) = G_LOAD %p, :: (load 1)
// %v2(s32) = G_ZEXT %v(s8)
if (!IsSigned) {
auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
bool IsGPR =
RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
if (LoadMI && IsGPR) {
const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
unsigned BytesLoaded = MemOp->getSize();
if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
return selectCopy(I, TII, MRI, TRI, RBI);
}
// If we are zero extending from 32 bits to 64 bits, it's possible that
// the instruction implicitly does the zero extend for us. In that case,
// we can just emit a SUBREG_TO_REG.
if (IsGPR && SrcSize == 32 && DstSize == 64) {
// Unlike with the G_LOAD case, we don't want to look through copies
// here.
MachineInstr *Def = MRI.getVRegDef(SrcReg);
if (Def && isDef32(*Def)) {
MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
return false;
}
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
return false;
}
I.eraseFromParent();
return true;
}
}
}
if (DstSize == 64) {
if (Opcode != TargetOpcode::G_SEXT_INREG) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
<< " operand\n");
return false;
}
SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
{&AArch64::GPR64RegClass}, {})
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32)
.getReg(0);
}
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
{DefReg}, {SrcReg})
.addImm(0)
.addImm(SrcSize - 1);
} else if (DstSize <= 32) {
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
{DefReg}, {SrcReg})
.addImm(0)
.addImm(SrcSize - 1);
} else {
return false;
}
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
SrcTy = MRI.getType(I.getOperand(1).getReg());
const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
if (NewOpc == Opcode)
return false;
I.setDesc(TII.get(NewOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
case TargetOpcode::G_FREEZE:
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_INTTOPTR:
// The importer is currently unable to import pointer types since they
// didn't exist in SelectionDAG.
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_BITCAST:
// Imported SelectionDAG rules can handle every bitcast except those that
// bitcast from a type to the same type. Ideally, these shouldn't occur
// but we might not run an optimizer that deletes them. The other exception
// is bitcasts involving pointer types, as SelectionDAG has no knowledge
// of them.
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_SELECT: {
if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
<< ", expected: " << LLT::scalar(1) << '\n');
return false;
}
const Register CondReg = I.getOperand(1).getReg();
const Register TReg = I.getOperand(2).getReg();
const Register FReg = I.getOperand(3).getReg();
if (tryOptSelect(I))
return true;
// Make sure to use an unused vreg instead of wzr, so that the peephole
// optimizations will be able to optimize these.
MachineIRBuilder MIB(I);
Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
return false;
I.eraseFromParent();
return true;
}
case TargetOpcode::G_ICMP: {
if (Ty.isVector())
return selectVectorICmp(I, MRI);
if (Ty != LLT::scalar(32)) {
LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
<< ", expected: " << LLT::scalar(32) << '\n');
return false;
}
MachineIRBuilder MIRBuilder(I);
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
MIRBuilder);
emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_FCMP: {
MachineIRBuilder MIRBuilder(I);
CmpInst::Predicate Pred =
static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(),
MIRBuilder) ||
!emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder))
return false;
I.eraseFromParent();
return true;
}
case TargetOpcode::G_VASTART:
return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
: selectVaStartAAPCS(I, MF, MRI);
case TargetOpcode::G_INTRINSIC:
return selectIntrinsic(I, MRI);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
return selectIntrinsicWithSideEffects(I, MRI);
case TargetOpcode::G_IMPLICIT_DEF: {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const Register DstReg = I.getOperand(0).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(DstTy, DstRB, RBI);
RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
return true;
}
case TargetOpcode::G_BLOCK_ADDR: {
if (TM.getCodeModel() == CodeModel::Large) {
materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
I.eraseFromParent();
return true;
} else {
I.setDesc(TII.get(AArch64::MOVaddrBA));
auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
I.getOperand(0).getReg())
.addBlockAddress(I.getOperand(1).getBlockAddress(),
/* Offset */ 0, AArch64II::MO_PAGE)
.addBlockAddress(
I.getOperand(1).getBlockAddress(), /* Offset */ 0,
AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
}
case AArch64::G_DUP: {
// When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
// imported patterns. Do it manually here. Avoiding generating s16 gpr is
// difficult because at RBS we may end up pessimizing the fpr case if we
// decided to add an anyextend to fix this. Manual selection is the most
// robust solution for now.
Register SrcReg = I.getOperand(1).getReg();
if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID)
return false; // We expect the fpr regbank case to be imported.
LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.getSizeInBits() == 16)
I.setDesc(TII.get(AArch64::DUPv8i16gpr));
else if (SrcTy.getSizeInBits() == 8)
I.setDesc(TII.get(AArch64::DUPv16i8gpr));
else
return false;
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_INTRINSIC_TRUNC:
return selectIntrinsicTrunc(I, MRI);
case TargetOpcode::G_INTRINSIC_ROUND:
return selectIntrinsicRound(I, MRI);
case TargetOpcode::G_BUILD_VECTOR:
return selectBuildVector(I, MRI);
case TargetOpcode::G_MERGE_VALUES:
return selectMergeValues(I, MRI);
case TargetOpcode::G_UNMERGE_VALUES:
return selectUnmergeValues(I, MRI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectShuffleVector(I, MRI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectExtractElt(I, MRI);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return selectInsertElt(I, MRI);
case TargetOpcode::G_CONCAT_VECTORS:
return selectConcatVectors(I, MRI);
case TargetOpcode::G_JUMP_TABLE:
return selectJumpTable(I, MRI);
case TargetOpcode::G_VECREDUCE_FADD:
case TargetOpcode::G_VECREDUCE_ADD:
return selectReduction(I, MRI);
}
return false;
}
bool AArch64InstructionSelector::selectReduction(
MachineInstr &I, MachineRegisterInfo &MRI) const {
Register VecReg = I.getOperand(1).getReg();
LLT VecTy = MRI.getType(VecReg);
if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
unsigned Opc = 0;
if (VecTy == LLT::vector(16, 8))
Opc = AArch64::ADDVv16i8v;
else if (VecTy == LLT::vector(8, 16))
Opc = AArch64::ADDVv8i16v;
else if (VecTy == LLT::vector(4, 32))
Opc = AArch64::ADDVv4i32v;
else if (VecTy == LLT::vector(2, 64))
Opc = AArch64::ADDPv2i64p;
else {
LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
return false;
}
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
unsigned Opc = 0;
if (VecTy == LLT::vector(2, 32))
Opc = AArch64::FADDPv2i32p;
else if (VecTy == LLT::vector(2, 64))
Opc = AArch64::FADDPv2i64p;
else {
LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
return false;
}
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
return false;
}
bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
Register JTAddr = I.getOperand(0).getReg();
unsigned JTI = I.getOperand(1).getIndex();
Register Index = I.getOperand(2).getReg();
MachineIRBuilder MIB(I);
Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
{TargetReg, ScratchReg}, {JTAddr, Index})
.addJumpTableIndex(JTI);
// Build the indirect branch.
MIB.buildInstr(AArch64::BR, {}, {TargetReg});
I.eraseFromParent();
return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectJumpTable(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
Register DstReg = I.getOperand(0).getReg();
unsigned JTI = I.getOperand(1).getIndex();
// We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
MachineIRBuilder MIB(I);
auto MovMI =
MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
.addJumpTableIndex(JTI, AArch64II::MO_PAGE)
.addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectTLSGlobalValue(
MachineInstr &I, MachineRegisterInfo &MRI) const {
if (!STI.isTargetMachO())
return false;
MachineFunction &MF = *I.getParent()->getParent();
MF.getFrameInfo().setAdjustsStack(true);
const GlobalValue &GV = *I.getOperand(1).getGlobal();
MachineIRBuilder MIB(I);
auto LoadGOT =
MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
.addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
{LoadGOT.getReg(0)})
.addImm(0);
MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
.addUse(AArch64::X0, RegState::Implicit)
.addDef(AArch64::X0, RegState::Implicit)
.addRegMask(TRI.getTLSCallPreservedMask());
MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
MRI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectIntrinsicTrunc(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
// Select the correct opcode.
unsigned Opc = 0;
if (!SrcTy.isVector()) {
switch (SrcTy.getSizeInBits()) {
default:
case 16:
Opc = AArch64::FRINTZHr;
break;
case 32:
Opc = AArch64::FRINTZSr;
break;
case 64:
Opc = AArch64::FRINTZDr;
break;
}
} else {
unsigned NumElts = SrcTy.getNumElements();
switch (SrcTy.getElementType().getSizeInBits()) {
default:
break;
case 16:
if (NumElts == 4)
Opc = AArch64::FRINTZv4f16;
else if (NumElts == 8)
Opc = AArch64::FRINTZv8f16;
break;
case 32:
if (NumElts == 2)
Opc = AArch64::FRINTZv2f32;
else if (NumElts == 4)
Opc = AArch64::FRINTZv4f32;
break;
case 64:
if (NumElts == 2)
Opc = AArch64::FRINTZv2f64;
break;
}
}
if (!Opc) {
// Didn't get an opcode above, bail.
LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
return false;
}
// Legalization would have set us up perfectly for this; we just need to
// set the opcode and move on.
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectIntrinsicRound(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
// Select the correct opcode.
unsigned Opc = 0;
if (!SrcTy.isVector()) {
switch (SrcTy.getSizeInBits()) {
default:
case 16:
Opc = AArch64::FRINTAHr;
break;
case 32:
Opc = AArch64::FRINTASr;
break;
case 64:
Opc = AArch64::FRINTADr;
break;
}
} else {
unsigned NumElts = SrcTy.getNumElements();
switch (SrcTy.getElementType().getSizeInBits()) {
default:
break;
case 16:
if (NumElts == 4)
Opc = AArch64::FRINTAv4f16;
else if (NumElts == 8)
Opc = AArch64::FRINTAv8f16;
break;
case 32:
if (NumElts == 2)
Opc = AArch64::FRINTAv2f32;
else if (NumElts == 4)
Opc = AArch64::FRINTAv4f32;
break;
case 64:
if (NumElts == 2)
Opc = AArch64::FRINTAv2f64;
break;
}
}
if (!Opc) {
// Didn't get an opcode above, bail.
LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
return false;
}
// Legalization would have set us up perfectly for this; we just need to
// set the opcode and move on.
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectVectorICmp(
MachineInstr &I, MachineRegisterInfo &MRI) const {
Register DstReg = I.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
Register SrcReg = I.getOperand(2).getReg();
Register Src2Reg = I.getOperand(3).getReg();
LLT SrcTy = MRI.getType(SrcReg);
unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
unsigned NumElts = DstTy.getNumElements();
// First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
// Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
// Third index is cc opcode:
// 0 == eq
// 1 == ugt
// 2 == uge
// 3 == ult
// 4 == ule
// 5 == sgt
// 6 == sge
// 7 == slt
// 8 == sle
// ne is done by negating 'eq' result.
// This table below assumes that for some comparisons the operands will be
// commuted.
// ult op == commute + ugt op
// ule op == commute + uge op
// slt op == commute + sgt op
// sle op == commute + sge op
unsigned PredIdx = 0;
bool SwapOperands = false;
CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
switch (Pred) {
case CmpInst::ICMP_NE:
case CmpInst::ICMP_EQ:
PredIdx = 0;
break;
case CmpInst::ICMP_UGT:
PredIdx = 1;
break;
case CmpInst::ICMP_UGE:
PredIdx = 2;
break;
case CmpInst::ICMP_ULT:
PredIdx = 3;
SwapOperands = true;
break;
case CmpInst::ICMP_ULE:
PredIdx = 4;
SwapOperands = true;
break;
case CmpInst::ICMP_SGT:
PredIdx = 5;
break;
case CmpInst::ICMP_SGE:
PredIdx = 6;
break;
case CmpInst::ICMP_SLT:
PredIdx = 7;
SwapOperands = true;
break;
case CmpInst::ICMP_SLE:
PredIdx = 8;
SwapOperands = true;
break;
default:
llvm_unreachable("Unhandled icmp predicate");
return false;
}
// This table obviously should be tablegen'd when we have our GISel native
// tablegen selector.
static const unsigned OpcTable[4][4][9] = {
{
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
{AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
},
{
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
{AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */}
},
{
{AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
{AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */}
},
{
{AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */},
{0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
0 /* invalid */}
},
};
unsigned EltIdx = Log2_32(SrcEltSize / 8);
unsigned NumEltsIdx = Log2_32(NumElts / 2);
unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
if (!Opc) {
LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
return false;
}
const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
const TargetRegisterClass *SrcRC =
getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
if (!SrcRC) {
LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
return false;
}
unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
if (SrcTy.getSizeInBits() == 128)
NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
if (SwapOperands)
std::swap(SrcReg, Src2Reg);
MachineIRBuilder MIB(I);
auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
// Invert if we had a 'ne' cc.
if (NotOpc) {
Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
} else {
MIB.buildCopy(DstReg, Cmp.getReg(0));
}
RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
I.eraseFromParent();
return true;
}
MachineInstr *AArch64InstructionSelector::emitScalarToVector(
unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
MachineIRBuilder &MIRBuilder) const {
auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
auto BuildFn = [&](unsigned SubregIndex) {
auto Ins =
MIRBuilder
.buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
.addImm(SubregIndex);
constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
return &*Ins;
};
switch (EltSize) {
case 16:
return BuildFn(AArch64::hsub);
case 32:
return BuildFn(AArch64::ssub);
case 64:
return BuildFn(AArch64::dsub);
default:
return nullptr;
}
}
bool AArch64InstructionSelector::selectMergeValues(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
if (I.getNumOperands() != 3)
return false;
// Merging 2 s64s into an s128.
if (DstTy == LLT::scalar(128)) {
if (SrcTy.getSizeInBits() != 64)
return false;
MachineIRBuilder MIB(I);
Register DstReg = I.getOperand(0).getReg();
Register Src1Reg = I.getOperand(1).getReg();
Register Src2Reg = I.getOperand(2).getReg();
auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
MachineInstr *InsMI =
emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
if (!InsMI)
return false;
MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
Src2Reg, /* LaneIdx */ 1, RB, MIB);
if (!Ins2MI)
return false;
constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
if (RB.getID() != AArch64::GPRRegBankID)
return false;
if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
return false;
auto *DstRC = &AArch64::GPR64RegClass;
Register SubToRegDef = MRI.createVirtualRegister(DstRC);
MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(SubToRegDef)
.addImm(0)
.addUse(I.getOperand(1).getReg())
.addImm(AArch64::sub_32);
Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
// Need to anyext the second scalar before we can use bfm
MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(SubToRegDef2)
.addImm(0)
.addUse(I.getOperand(2).getReg())
.addImm(AArch64::sub_32);
MachineInstr &BFM =
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
.addDef(I.getOperand(0).getReg())
.addUse(SubToRegDef)
.addUse(SubToRegDef2)
.addImm(32)
.addImm(31);
constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
const unsigned EltSize) {
// Choose a lane copy opcode and subregister based off of the size of the
// vector's elements.
switch (EltSize) {
case 16:
CopyOpc = AArch64::CPYi16;
ExtractSubReg = AArch64::hsub;
break;
case 32:
CopyOpc = AArch64::CPYi32;
ExtractSubReg = AArch64::ssub;
break;
case 64:
CopyOpc = AArch64::CPYi64;
ExtractSubReg = AArch64::dsub;
break;
default:
// Unknown size, bail out.
LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
return false;
}
return true;
}
MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
unsigned CopyOpc = 0;
unsigned ExtractSubReg = 0;
if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
LLVM_DEBUG(
dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
return nullptr;
}
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
if (!DstRC) {
LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
return nullptr;
}
const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
const LLT &VecTy = MRI.getType(VecReg);
const TargetRegisterClass *VecRC =
getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
if (!VecRC) {
LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
return nullptr;
}
// The register that we're going to copy into.
Register InsertReg = VecReg;
if (!DstReg)
DstReg = MRI.createVirtualRegister(DstRC);
// If the lane index is 0, we just use a subregister COPY.
if (LaneIdx == 0) {
auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
.addReg(VecReg, 0, ExtractSubReg);
RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
return &*Copy;
}
// Lane copies require 128-bit wide registers. If we're dealing with an
// unpacked vector, then we need to move up to that width. Insert an implicit
// def and a subregister insert to get us there.
if (VecTy.getSizeInBits() != 128) {
MachineInstr *ScalarToVector = emitScalarToVector(
VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
if (!ScalarToVector)
return nullptr;
InsertReg = ScalarToVector->getOperand(0).getReg();
}
MachineInstr *LaneCopyMI =
MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
// Make sure that we actually constrain the initial copy.
RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
return LaneCopyMI;
}
bool AArch64InstructionSelector::selectExtractElt(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
"unexpected opcode!");
Register DstReg = I.getOperand(0).getReg();
const LLT NarrowTy = MRI.getType(DstReg);
const Register SrcReg = I.getOperand(1).getReg();
const LLT WideTy = MRI.getType(SrcReg);
(void)WideTy;
assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
"source register size too small!");
assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
// Need the lane index to determine the correct copy opcode.
MachineOperand &LaneIdxOp = I.getOperand(2);
assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
return false;
}
// Find the index to extract from.
auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
if (!VRegAndVal)
return false;
unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
MachineIRBuilder MIRBuilder(I);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
LaneIdx, MIRBuilder);
if (!Extract)
return false;
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectSplitVectorUnmerge(
MachineInstr &I, MachineRegisterInfo &MRI) const {
unsigned NumElts = I.getNumOperands() - 1;
Register SrcReg = I.getOperand(NumElts).getReg();
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(SrcReg);
assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
if (SrcTy.getSizeInBits() > 128) {
LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
return false;
}
MachineIRBuilder MIB(I);
// We implement a split vector operation by treating the sub-vectors as
// scalars and extracting them.
const RegisterBank &DstRB =
*RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
Register Dst = I.getOperand(OpIdx).getReg();
MachineInstr *Extract =
emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
if (!Extract)
return false;
}
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectUnmergeValues(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
"unexpected opcode");
// TODO: Handle unmerging into GPRs and from scalars to scalars.
if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
AArch64::FPRRegBankID ||
RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
AArch64::FPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
"currently unsupported.\n");
return false;
}
// The last operand is the vector source register, and every other operand is
// a register to unpack into.
unsigned NumElts = I.getNumOperands() - 1;
Register SrcReg = I.getOperand(NumElts).getReg();
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
const LLT WideTy = MRI.getType(SrcReg);
(void)WideTy;
assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
"can only unmerge from vector or s128 types!");
assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
"source register size too small!");
if (!NarrowTy.isScalar())
return selectSplitVectorUnmerge(I, MRI);
MachineIRBuilder MIB(I);
// Choose a lane copy opcode and subregister based off of the size of the
// vector's elements.
unsigned CopyOpc = 0;
unsigned ExtractSubReg = 0;
if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
return false;
// Set up for the lane copies.
MachineBasicBlock &MBB = *I.getParent();
// Stores the registers we'll be copying from.
SmallVector<Register, 4> InsertRegs;
// We'll use the first register twice, so we only need NumElts-1 registers.
unsigned NumInsertRegs = NumElts - 1;
// If our elements fit into exactly 128 bits, then we can copy from the source
// directly. Otherwise, we need to do a bit of setup with some subregister
// inserts.
if (NarrowTy.getSizeInBits() * NumElts == 128) {
InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
} else {
// No. We have to perform subregister inserts. For each insert, create an
// implicit def and a subregister insert, and save the register we create.
for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
MachineInstr &ImpDefMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
ImpDefReg);
// Now, create the subregister insert from SrcReg.
Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
MachineInstr &InsMI =
*BuildMI(MBB, I, I.getDebugLoc(),
TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
.addUse(ImpDefReg)
.addUse(SrcReg)
.addImm(AArch64::dsub);
constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
// Save the register so that we can copy from it after.
InsertRegs.push_back(InsertReg);
}
}
// Now that we've created any necessary subregister inserts, we can
// create the copies.
//
// Perform the first copy separately as a subregister copy.
Register CopyTo = I.getOperand(0).getReg();
auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
.addReg(InsertRegs[0], 0, ExtractSubReg);
constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
// Now, perform the remaining copies as vector lane copies.
unsigned LaneIdx = 1;
for (Register InsReg : InsertRegs) {
Register CopyTo = I.getOperand(LaneIdx).getReg();
MachineInstr &CopyInst =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
.addUse(InsReg)
.addImm(LaneIdx);
constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
++LaneIdx;
}
// Separately constrain the first copy's destination. Because of the
// limitation in constrainOperandRegClass, we can't guarantee that this will
// actually be constrained. So, do it ourselves using the second operand.
const TargetRegisterClass *RC =
MRI.getRegClassOrNull(I.getOperand(1).getReg());
if (!RC) {
LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
return false;
}
RBI.constrainGenericRegister(CopyTo, *RC, MRI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectConcatVectors(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
"Unexpected opcode");
Register Dst = I.getOperand(0).getReg();
Register Op1 = I.getOperand(1).getReg();
Register Op2 = I.getOperand(2).getReg();
MachineIRBuilder MIRBuilder(I);
MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder);
if (!ConcatMI)
return false;
I.eraseFromParent();
return true;
}
unsigned
AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const {
Type *CPTy = CPVal->getType();
Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
MachineConstantPool *MCP = MF.getConstantPool();
return MCP->getConstantPoolIndex(CPVal, Alignment);
}
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
auto Adrp =
MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
.addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
MachineInstr *LoadMI = nullptr;
switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
case 16:
LoadMI =
&*MIRBuilder
.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
.addConstantPoolIndex(CPIdx, 0,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
break;
case 8:
LoadMI = &*MIRBuilder
.buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
.addConstantPoolIndex(
CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
break;
default:
LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
<< *CPVal->getType());
return nullptr;
}
constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
return LoadMI;
}
/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
/// size and RB.
static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
unsigned Opc, SubregIdx;
if (RB.getID() == AArch64::GPRRegBankID) {
if (EltSize == 16) {
Opc = AArch64::INSvi16gpr;
SubregIdx = AArch64::ssub;
} else if (EltSize == 32) {
Opc = AArch64::INSvi32gpr;
SubregIdx = AArch64::ssub;
} else if (EltSize == 64) {
Opc = AArch64::INSvi64gpr;
SubregIdx = AArch64::dsub;
} else {
llvm_unreachable("invalid elt size!");
}
} else {
if (EltSize == 8) {
Opc = AArch64::INSvi8lane;
SubregIdx = AArch64::bsub;
} else if (EltSize == 16) {
Opc = AArch64::INSvi16lane;
SubregIdx = AArch64::hsub;
} else if (EltSize == 32) {
Opc = AArch64::INSvi32lane;
SubregIdx = AArch64::ssub;
} else if (EltSize == 64) {
Opc = AArch64::INSvi64lane;
SubregIdx = AArch64::dsub;
} else {
llvm_unreachable("invalid elt size!");
}
}
return std::make_pair(Opc, SubregIdx);
}
MachineInstr *AArch64InstructionSelector::emitInstr(
unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
const ComplexRendererFns &RenderFns) const {
assert(Opcode && "Expected an opcode?");
assert(!isPreISelGenericOpcode(Opcode) &&
"Function should only be used to produce selected instructions!");
auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
if (RenderFns)
for (auto &Fn : *RenderFns)
Fn(MI);
constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
return &*MI;
}
MachineInstr *AArch64InstructionSelector::emitAddSub(
const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
Register Dst, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
auto Ty = MRI.getType(LHS.getReg());
assert(!Ty.isVector() && "Expected a scalar or pointer?");
unsigned Size = Ty.getSizeInBits();
assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
bool Is32Bit = Size == 32;
// INSTRri form with positive arithmetic immediate.
if (auto Fns = selectArithImmed(RHS))
return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
MIRBuilder, Fns);
// INSTRri form with negative arithmetic immediate.
if (auto Fns = selectNegArithImmed(RHS))
return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
MIRBuilder, Fns);
// INSTRrx form.
if (auto Fns = selectArithExtendedRegister(RHS))
return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
MIRBuilder, Fns);
// INSTRrs form.
if (auto Fns = selectShiftedRegister(RHS))
return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
MIRBuilder, Fns);
return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
const std::array<std::array<unsigned, 2>, 5> OpcTable{
{{AArch64::ADDXri, AArch64::ADDWri},
{AArch64::ADDXrs, AArch64::ADDWrs},
{AArch64::ADDXrr, AArch64::ADDWrr},
{AArch64::SUBXri, AArch64::SUBWri},
{AArch64::ADDXrx, AArch64::ADDWrx}}};
return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
const std::array<std::array<unsigned, 2>, 5> OpcTable{
{{AArch64::ADDSXri, AArch64::ADDSWri},
{AArch64::ADDSXrs, AArch64::ADDSWrs},
{AArch64::ADDSXrr, AArch64::ADDSWrr},
{AArch64::SUBSXri, AArch64::SUBSWri},
{AArch64::ADDSXrx, AArch64::ADDSWrx}}};
return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
const std::array<std::array<unsigned, 2>, 5> OpcTable{
{{AArch64::SUBSXri, AArch64::SUBSWri},
{AArch64::SUBSXrs, AArch64::SUBSWrs},
{AArch64::SUBSXrr, AArch64::SUBSWrr},
{AArch64::ADDSXri, AArch64::ADDSWri},
{AArch64::SUBSXrx, AArch64::SUBSWrx}}};
return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
LLT Ty = MRI.getType(LHS.getReg());
unsigned RegSize = Ty.getSizeInBits();
bool Is32Bit = (RegSize == 32);
const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
{AArch64::ANDSXrs, AArch64::ANDSWrs},
{AArch64::ANDSXrr, AArch64::ANDSWrr}};
// ANDS needs a logical immediate for its immediate form. Check if we can
// fold one in.
if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
int64_t Imm = ValAndVReg->Value.getSExtValue();
if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
return &*TstMI;
}
}
if (auto Fns = selectLogicalShiftedRegister(RHS))
return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
}
MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
assert(Predicate.isPredicate() && "Expected predicate?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
LLT CmpTy = MRI.getType(LHS.getReg());
assert(!CmpTy.isVector() && "Expected scalar or pointer");
unsigned Size = CmpTy.getSizeInBits();
(void)Size;
assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
// Fold the compare into a cmn or tst if possible.
if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
return FoldCmp;
auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
return emitSUBS(Dst, LHS, RHS, MIRBuilder);
}
MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
#ifndef NDEBUG
LLT Ty = MRI.getType(Dst);
assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
"Expected a 32-bit scalar register?");
#endif
const Register ZeroReg = AArch64::WZR;
auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
auto CSet =
MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
.addImm(getInvertedCondCode(CC));
constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
return &*CSet;
};
AArch64CC::CondCode CC1, CC2;
changeFCMPPredToAArch64CC(Pred, CC1, CC2);
if (CC2 == AArch64CC::AL)
return EmitCSet(Dst, CC1);
const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
Register Def1Reg = MRI.createVirtualRegister(RC);
Register Def2Reg = MRI.createVirtualRegister(RC);
EmitCSet(Def1Reg, CC1);
EmitCSet(Def2Reg, CC2);
auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
return &*OrMI;
}
MachineInstr *
AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
MachineIRBuilder &MIRBuilder) const {
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
LLT Ty = MRI.getType(LHS);
if (Ty.isVector())
return nullptr;
unsigned OpSize = Ty.getSizeInBits();
if (OpSize != 32 && OpSize != 64)
return nullptr;
// If this is a compare against +0.0, then we don't have
// to explicitly materialize a constant.
const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
{AArch64::FCMPSri, AArch64::FCMPDri}};
unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
// Partially build the compare. Decide if we need to add a use for the
// third operand based off whether or not we're comparing against 0.0.
auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
if (!ShouldUseImm)
CmpMI.addUse(RHS);
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
return &*CmpMI;
}
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
Optional<Register> Dst, Register Op1, Register Op2,
MachineIRBuilder &MIRBuilder) const {
// We implement a vector concat by:
// 1. Use scalar_to_vector to insert the lower vector into the larger dest
// 2. Insert the upper vector into the destination's upper element
// TODO: some of this code is common with G_BUILD_VECTOR handling.
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
const LLT Op1Ty = MRI.getType(Op1);
const LLT Op2Ty = MRI.getType(Op2);
if (Op1Ty != Op2Ty) {
LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
return nullptr;
}
assert(Op1Ty.isVector() && "Expected a vector for vector concat");
if (Op1Ty.getSizeInBits() >= 128) {
LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
return nullptr;
}
// At the moment we just support 64 bit vector concats.
if (Op1Ty.getSizeInBits() != 64) {
LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
return nullptr;
}
const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
const TargetRegisterClass *DstRC =
getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
MachineInstr *WidenedOp1 =
emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
MachineInstr *WidenedOp2 =
emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
if (!WidenedOp1 || !WidenedOp2) {
LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
return nullptr;
}
// Now do the insert of the upper element.
unsigned InsertOpc, InsSubRegIdx;
std::tie(InsertOpc, InsSubRegIdx) =
getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
if (!Dst)
Dst = MRI.createVirtualRegister(DstRC);
auto InsElt =
MIRBuilder
.buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
.addImm(1) /* Lane index */
.addUse(WidenedOp2->getOperand(0).getReg())
.addImm(0);
constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
return &*InsElt;
}
MachineInstr *AArch64InstructionSelector::emitFMovForFConstant(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_FCONSTANT &&
"Expected a G_FCONSTANT!");
MachineOperand &ImmOp = I.getOperand(1);
unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
// Only handle 32 and 64 bit defs for now.
if (DefSize != 32 && DefSize != 64)
return nullptr;
// Don't handle null values using FMOV.
if (ImmOp.getFPImm()->isNullValue())
return nullptr;
// Get the immediate representation for the FMOV.
const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF();
int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF)
: AArch64_AM::getFP64Imm(ImmValAPF);
// If this is -1, it means the immediate can't be represented as the requested
// floating point value. Bail.
if (Imm == -1)
return nullptr;
// Update MI to represent the new FMOV instruction, constrain it, and return.
ImmOp.ChangeToImmediate(Imm);
unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi;
I.setDesc(TII.get(MovOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return &I;
}
MachineInstr *
AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
MachineIRBuilder &MIRBuilder) const {
// CSINC increments the result when the predicate is false. Invert it.
const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
auto I =
MIRBuilder
.buildInstr(AArch64::CSINCWr, {DefReg}, {Register(AArch64::WZR), Register(AArch64::WZR)})
.addImm(InvCC);
constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
return &*I;
}
std::pair<MachineInstr *, AArch64CC::CondCode>
AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
switch (Opcode) {
default:
llvm_unreachable("Unexpected opcode!");
case TargetOpcode::G_SADDO:
return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
case TargetOpcode::G_UADDO:
return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
case TargetOpcode::G_SSUBO:
return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
}
}
bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
MachineIRBuilder MIB(I);
MachineRegisterInfo &MRI = *MIB.getMRI();
// We want to recognize this pattern:
//
// $z = G_FCMP pred, $x, $y
// ...
// $w = G_SELECT $z, $a, $b
//
// Where the value of $z is *only* ever used by the G_SELECT (possibly with
// some copies/truncs in between.)
//
// If we see this, then we can emit something like this:
//
// fcmp $x, $y
// fcsel $w, $a, $b, pred
//
// Rather than emitting both of the rather long sequences in the standard
// G_FCMP/G_SELECT select methods.
// First, check if the condition is defined by a compare.
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
while (CondDef) {
// We can only fold if all of the defs have one use.
Register CondDefReg = CondDef->getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDefReg)) {
// Unless it's another select.
for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
if (CondDef == &UI)
continue;
if (UI.getOpcode() != TargetOpcode::G_SELECT)
return false;
}
}
// We can skip over G_TRUNC since the condition is 1-bit.
// Truncating/extending can have no impact on the value.
unsigned Opc = CondDef->getOpcode();
if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
break;
// Can't see past copies from physregs.
if (Opc == TargetOpcode::COPY &&
Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
return false;
CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
}
// Is the condition defined by a compare?
if (!CondDef)
return false;
unsigned CondOpc = CondDef->getOpcode();
if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
return false;
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
auto Pred =
static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
CondCode = changeICMPPredToAArch64CC(Pred);
emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
CondDef->getOperand(1), MIB);
} else {
// Get the condition code for the select.
auto Pred =
static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
AArch64CC::CondCode CondCode2;
changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
// changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
// instructions to emit the comparison.
// TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
// unnecessary.
if (CondCode2 != AArch64CC::AL)
return false;
if (!emitFPCompare(CondDef->getOperand(2).getReg(),
CondDef->getOperand(3).getReg(), MIB)) {
LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
return false;
}
}
// Emit the select.
emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
I.getOperand(3).getReg(), CondCode, MIB);
I.eraseFromParent();
return true;
}
MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
"Unexpected MachineOperand");
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
// We want to find this sort of thing:
// x = G_SUB 0, y
// G_ICMP z, x
//
// In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
// e.g:
//
// cmn z, y
// Helper lambda to detect the subtract followed by the compare.
// Takes in the def of the LHS or RHS, and checks if it's a subtract from 0.
auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) {
if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB)
return false;
// Need to make sure NZCV is the same at the end of the transformation.
if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return false;
// We want to match against SUBs.
if (DefMI->getOpcode() != TargetOpcode::G_SUB)
return false;
// Make sure that we're getting
// x = G_SUB 0, y
auto ValAndVReg =
getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI);
if (!ValAndVReg || ValAndVReg->Value != 0)
return false;
// This can safely be represented as a CMN.
return true;
};
// Check if the RHS or LHS of the G_ICMP is defined by a SUB
MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P);
// Given this:
//
// x = G_SUB 0, y
// G_ICMP x, z
//
// Produce this:
//
// cmn y, z
if (IsCMN(LHSDef, CC))
return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
// Same idea here, but with the RHS of the compare instead:
//
// Given this:
//
// x = G_SUB 0, y
// G_ICMP z, x
//
// Produce this:
//
// cmn z, y
if (IsCMN(RHSDef, CC))
return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
// Given this:
//
// z = G_AND x, y
// G_ICMP z, 0
//
// Produce this if the compare is signed:
//
// tst x, y
if (!CmpInst::isUnsigned(P) && LHSDef &&
LHSDef->getOpcode() == TargetOpcode::G_AND) {
// Make sure that the RHS is 0.
auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!ValAndVReg || ValAndVReg->Value != 0)
return nullptr;
return emitTST(LHSDef->getOperand(1),
LHSDef->getOperand(2), MIRBuilder);
}
return nullptr;
}
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
const LLT Src1Ty = MRI.getType(Src1Reg);
Register Src2Reg = I.getOperand(2).getReg();
const LLT Src2Ty = MRI.getType(Src2Reg);
ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
LLVMContext &Ctx = MF.getFunction().getContext();
// G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
// it's originated from a <1 x T> type. Those should have been lowered into
// G_BUILD_VECTOR earlier.
if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
return false;
}
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
SmallVector<Constant *, 64> CstIdxs;
for (int Val : Mask) {
// For now, any undef indexes we'll just assume to be 0. This should be
// optimized in future, e.g. to select DUP etc.
Val = Val < 0 ? 0 : Val;
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
}
}
MachineIRBuilder MIRBuilder(I);
// Use a constant pool to load the index vector for TBL.
Constant *CPVal = ConstantVector::get(CstIdxs);
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
if (!IndexLoad) {
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
return false;
}
if (DstTy.getSizeInBits() != 128) {
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
// This case can be done with TBL1.
MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder);
if (!Concat) {
LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
return false;
}
// The constant pool load will be 64 bits, so need to convert to FPR128 reg.
IndexLoad =
emitScalarToVector(64, &AArch64::FPR128RegClass,
IndexLoad->getOperand(0).getReg(), MIRBuilder);
auto TBL1 = MIRBuilder.buildInstr(
AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
{Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
auto Copy =
MIRBuilder
.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(TBL1.getReg(0), 0, AArch64::dsub);
RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
I.eraseFromParent();
return true;
}
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
// Q registers for regalloc.
auto RegSeq = MIRBuilder
.buildInstr(TargetOpcode::REG_SEQUENCE,
{&AArch64::QQRegClass}, {Src1Reg})
.addImm(AArch64::qsub0)
.addUse(Src2Reg)
.addImm(AArch64::qsub1);
auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
{RegSeq, IndexLoad->getOperand(0)});
constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
MachineInstr *AArch64InstructionSelector::emitLaneInsert(
Optional<Register> DstReg, Register SrcReg, Register EltReg,
unsigned LaneIdx, const RegisterBank &RB,
MachineIRBuilder &MIRBuilder) const {
MachineInstr *InsElt = nullptr;
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
// Create a register to define with the insert if one wasn't passed in.
if (!DstReg)
DstReg = MRI.createVirtualRegister(DstRC);
unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
if (RB.getID() == AArch64::FPRRegBankID) {
auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
.addImm(LaneIdx)
.addUse(InsSub->getOperand(0).getReg())
.addImm(0);
} else {
InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
.addImm(LaneIdx)
.addUse(EltReg);
}
constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
return InsElt;
}
bool AArch64InstructionSelector::selectInsertElt(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
// Get information on the destination.
Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
unsigned VecSize = DstTy.getSizeInBits();
// Get information on the element we want to insert into the destination.
Register EltReg = I.getOperand(2).getReg();
const LLT EltTy = MRI.getType(EltReg);
unsigned EltSize = EltTy.getSizeInBits();
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
// Find the definition of the index. Bail out if it's not defined by a
// G_CONSTANT.
Register IdxReg = I.getOperand(3).getReg();
auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
if (!VRegAndVal)
return false;
unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
// Perform the lane insert.
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
MachineIRBuilder MIRBuilder(I);
if (VecSize < 128) {
// If the vector we're inserting into is smaller than 128 bits, widen it
// to 128 to do the insert.
MachineInstr *ScalarToVec = emitScalarToVector(
VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder);
if (!ScalarToVec)
return false;
SrcReg = ScalarToVec->getOperand(0).getReg();
}
// Create an insert into a new FPR128 register.
// Note that if our vector is already 128 bits, we end up emitting an extra
// register.
MachineInstr *InsMI =
emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder);
if (VecSize < 128) {
// If we had to widen to perform the insert, then we have to demote back to
// the original size to get the result we want.
Register DemoteVec = InsMI->getOperand(0).getReg();
const TargetRegisterClass *RC =
getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
return false;
}
unsigned SubReg = 0;
if (!getSubRegForClass(RC, TRI, SubReg))
return false;
if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
<< "\n");
return false;
}
MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
.addReg(DemoteVec, 0, SubReg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
// No widening needed.
InsMI->getOperand(0).setReg(DstReg);
constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
}
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::tryOptConstantBuildVec(
MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
unsigned DstSize = DstTy.getSizeInBits();
assert(DstSize <= 128 && "Unexpected build_vec type!");
if (DstSize < 32)
return false;
// Check if we're building a constant vector, in which case we want to
// generate a constant pool load instead of a vector insert sequence.
SmallVector<Constant *, 16> Csts;
for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
// Try to find G_CONSTANT or G_FCONSTANT
auto *OpMI =
getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
if (OpMI)
Csts.emplace_back(
const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
I.getOperand(Idx).getReg(), MRI)))
Csts.emplace_back(
const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
else
return false;
}
Constant *CV = ConstantVector::get(Csts);
MachineIRBuilder MIB(I);
if (CV->isNullValue()) {
// Until the importer can support immAllZerosV in pattern leaf nodes,
// select a zero move manually here.
Register DstReg = I.getOperand(0).getReg();
if (DstSize == 128) {
auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
} else if (DstSize == 64) {
auto Mov =
MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
.addImm(0);
MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
.addReg(Mov.getReg(0), 0, AArch64::dsub);
I.eraseFromParent();
return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI);
}
}
auto *CPLoad = emitLoadFromConstantPool(CV, MIB);
if (!CPLoad) {
LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector");
return false;
}
MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0));
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
*MRI.getRegClass(CPLoad->getOperand(0).getReg()),
MRI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectBuildVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
// Until we port more of the optimized selections, for now just use a vector
// insert sequence.
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
unsigned EltSize = EltTy.getSizeInBits();
if (tryOptConstantBuildVec(I, DstTy, MRI))
return true;
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
MachineIRBuilder MIRBuilder(I);
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
MachineInstr *ScalarToVec =
emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
I.getOperand(1).getReg(), MIRBuilder);
if (!ScalarToVec)
return false;
Register DstVec = ScalarToVec->getOperand(0).getReg();
unsigned DstSize = DstTy.getSizeInBits();
// Keep track of the last MI we inserted. Later on, we might be able to save
// a copy using it.
MachineInstr *PrevMI = nullptr;
for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
// Note that if we don't do a subregister copy, we can end up making an
// extra register.
PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
MIRBuilder);
DstVec = PrevMI->getOperand(0).getReg();
}
// If DstTy's size in bits is less than 128, then emit a subregister copy
// from DstVec to the last register we've defined.
if (DstSize < 128) {
// Force this to be FPR using the destination vector.
const TargetRegisterClass *RC =
getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
if (!RC)
return false;
if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
return false;
}
unsigned SubReg = 0;
if (!getSubRegForClass(RC, TRI, SubReg))
return false;
if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
<< "\n");
return false;
}
Register Reg = MRI.createVirtualRegister(RC);
Register DstReg = I.getOperand(0).getReg();
MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
.addReg(DstVec, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(Reg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
// We don't need a subregister copy. Save a copy by re-using the
// destination register on the final insert.
assert(PrevMI && "PrevMI was null?");
PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
}
I.eraseFromParent();
return true;
}
/// Helper function to find an intrinsic ID on an a MachineInstr. Returns the
/// ID if it exists, and 0 otherwise.
static unsigned findIntrinsicID(MachineInstr &I) {
auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
return Op.isIntrinsicID();
});
if (IntrinOp == I.operands_end())
return 0;
return IntrinOp->getIntrinsicID();
}
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// Find the intrinsic ID.
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
MachineIRBuilder MIRBuilder(I);
// Select the instruction.
switch (IntrinID) {
default:
return false;
case Intrinsic::trap:
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1);
break;
case Intrinsic::debugtrap:
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
case Intrinsic::ubsantrap:
MIRBuilder.buildInstr(AArch64::BRK, {}, {})
.addImm(I.getOperand(1).getImm() | ('U' << 8));
break;
}
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
MachineRegisterInfo &MRI) {
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
MachineIRBuilder MIRBuilder(I);
switch (IntrinID) {
default:
break;
case Intrinsic::aarch64_crypto_sha1h: {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(2).getReg();
// FIXME: Should this be an assert?
if (MRI.getType(DstReg).getSizeInBits() != 32 ||
MRI.getType(SrcReg).getSizeInBits() != 32)
return false;
// The operation has to happen on FPRs. Set up some new FPR registers for
// the source and destination if they are on GPRs.
if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)});
// Make sure the copy ends up getting constrained properly.
RBI.constrainGenericRegister(I.getOperand(2).getReg(),
AArch64::GPR32RegClass, MRI);
}
if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
// Actually insert the instruction.
auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
// Did we create a new register for the destination?
if (DstReg != I.getOperand(0).getReg()) {
// Yep. Copy the result of the instruction back into the original
// destination.
MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg});
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AArch64::GPR32RegClass, MRI);
}
I.eraseFromParent();
return true;
}
case Intrinsic::frameaddress:
case Intrinsic::returnaddress: {
MachineFunction &MF = *I.getParent()->getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Depth = I.getOperand(2).getImm();
Register DstReg = I.getOperand(0).getReg();
RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
if (!MFReturnAddr) {
// Insert the copy from LR/X30 into the entry block, before it can be
// clobbered by anything.
MFI.setReturnAddressIsTaken(true);
MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
AArch64::GPR64RegClass);
}
if (STI.hasV8_3aOps()) {
MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
} else {
MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
MIRBuilder.buildInstr(AArch64::XPACLRI);
MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
}
I.eraseFromParent();
return true;
}
MFI.setFrameAddressIsTaken(true);
Register FrameAddr(AArch64::FP);
while (Depth--) {
Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
auto Ldr =
MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr})
.addImm(0);
constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
FrameAddr = NextFrame;
}
if (IntrinID == Intrinsic::frameaddress)
MIRBuilder.buildCopy({DstReg}, {FrameAddr});
else {
MFI.setReturnAddressIsTaken(true);
if (STI.hasV8_3aOps()) {
Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
} else {
MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}).addImm(1);
MIRBuilder.buildInstr(AArch64::XPACLRI);
MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
}
}
I.eraseFromParent();
return true;
}
}
return false;
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None || *MaybeImmed > 31)
return None;
uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None || *MaybeImmed > 31)
return None;
uint64_t Enc = 31 - *MaybeImmed;
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None || *MaybeImmed > 63)
return None;
uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None || *MaybeImmed > 63)
return None;
uint64_t Enc = 63 - *MaybeImmed;
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
/// Helper to select an immediate value that can be represented as a 12-bit
/// value shifted left by either 0 or 12. If it is possible to do so, return
/// the immediate and shift value. If not, return None.
///
/// Used by selectArithImmed and selectNegArithImmed.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::select12BitValueWithLeftShift(
uint64_t Immed) const {
unsigned ShiftAmt;
if (Immed >> 12 == 0) {
ShiftAmt = 0;
} else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
ShiftAmt = 12;
Immed = Immed >> 12;
} else
return None;
unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
}};
}
/// SelectArithImmed - Select an immediate value that can be represented as
/// a 12-bit value shifted left by either 0 or 12. If so, return true with
/// Val set to the 12-bit value and Shift set to the shifter operand.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
// This function is called from the addsub_shifted_imm ComplexPattern,
// which lists [imm] as the list of opcode it's interested in, however
// we still need to check whether the operand is actually an immediate
// here because the ComplexPattern opcode list is only used in
// root-level opcode matching.
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None)
return None;
return select12BitValueWithLeftShift(*MaybeImmed);
}
/// SelectNegArithImmed - As above, but negates the value before trying to
/// select it.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
// We need a register here, because we need to know if we have a 64 or 32
// bit immediate.
if (!Root.isReg())
return None;
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None)
return None;
uint64_t Immed = *MaybeImmed;
// This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
// have the opposite effect on the C flag, so this pattern mustn't match under
// those circumstances.
if (Immed == 0)
return None;
// Check if we're dealing with a 32-bit type on the root or a 64-bit type on
// the root.
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
Immed = ~((uint32_t)Immed) + 1;
else
Immed = ~Immed + 1ULL;
if (Immed & 0xFFFFFFFFFF000000ULL)
return None;
Immed &= 0xFFFFFFULL;
return select12BitValueWithLeftShift(Immed);
}
/// Return true if it is worth folding MI into an extended register. That is,
/// if it's safe to pull it into the addressing mode of a load or store as a
/// shift.
bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MachineInstr &MI, const MachineRegisterInfo &MRI) const {
// Always fold if there is one use, or if we're optimizing for size.
Register DefReg = MI.getOperand(0).getReg();
if (MRI.hasOneNonDBGUse(DefReg) ||
MI.getParent()->getParent()->getFunction().hasMinSize())
return true;
// It's better to avoid folding and recomputing shifts when we don't have a
// fastpath.
if (!STI.hasLSLFast())
return false;
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
// If it is, then we should fold.
return all_of(MRI.use_nodbg_instructions(DefReg),
[](MachineInstr &Use) { return Use.mayLoadOrStore(); });
}
static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
switch (Type) {
case AArch64_AM::SXTB:
case AArch64_AM::SXTH:
case AArch64_AM::SXTW:
return true;
default:
return false;
}
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtendedSHL(
MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
unsigned SizeInBytes, bool WantsExt) const {
assert(Base.isReg() && "Expected base to be a register operand");
assert(Offset.isReg() && "Expected offset to be a register operand");
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
if (!OffsetInst)
return None;
unsigned OffsetOpc = OffsetInst->getOpcode();
bool LookedThroughZExt = false;
if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
// Try to look through a ZEXT.
if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
return None;
OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
OffsetOpc = OffsetInst->getOpcode();
LookedThroughZExt = true;
if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
return None;
}
// Make sure that the memory op is a valid size.
int64_t LegalShiftVal = Log2_32(SizeInBytes);
if (LegalShiftVal == 0)
return None;
if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
return None;
// Now, try to find the specific G_CONSTANT. Start by assuming that the
// register we will offset is the LHS, and the register containing the
// constant is the RHS.
Register OffsetReg = OffsetInst->getOperand(1).getReg();
Register ConstantReg = OffsetInst->getOperand(2).getReg();
auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!ValAndVReg) {
// We didn't get a constant on the RHS. If the opcode is a shift, then
// we're done.
if (OffsetOpc == TargetOpcode::G_SHL)
return None;
// If we have a G_MUL, we can use either register. Try looking at the RHS.
std::swap(OffsetReg, ConstantReg);
ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!ValAndVReg)
return None;
}
// The value must fit into 3 bits, and must be positive. Make sure that is
// true.
int64_t ImmVal = ValAndVReg->Value.getSExtValue();
// Since we're going to pull this into a shift, the constant value must be
// a power of 2. If we got a multiply, then we need to check this.
if (OffsetOpc == TargetOpcode::G_MUL) {
if (!isPowerOf2_32(ImmVal))
return None;
// Got a power of 2. So, the amount we'll shift is the log base-2 of that.
ImmVal = Log2_32(ImmVal);
}
if ((ImmVal & 0x7) != ImmVal)
return None;
// We are only allowed to shift by LegalShiftVal. This shift value is built
// into the instruction, so we can't just use whatever we want.
if (ImmVal != LegalShiftVal)
return None;
unsigned SignExtend = 0;
if (WantsExt) {
// Check if the offset is defined by an extend, unless we looked through a
// G_ZEXT earlier.
if (!LookedThroughZExt) {
MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
// We only support SXTW for signed extension here.
if (SignExtend && Ext != AArch64_AM::SXTW)
return None;
OffsetReg = ExtInst->getOperand(1).getReg();
}
// Need a 32-bit wide register here.
MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
}
// We can use the LHS of the GEP as the base, and the LHS of the shift as an
// offset. Signify that we are shifting by setting the shift flag to 1.
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
[=](MachineInstrBuilder &MIB) {
// Need to add both immediates here to make sure that they are both
// added to the instruction.
MIB.addImm(SignExtend);
MIB.addImm(1);
}}};
}
/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3, lsl #3]
///
/// Where x2 is the base register, and x3 is an offset register. The shift-left
/// is a constant value specific to this load instruction. That is, we'll never
/// see anything other than a 3 here (which corresponds to the size of the
/// element being loaded.)
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
MachineOperand &Root, unsigned SizeInBytes) const {
if (!Root.isReg())
return None;
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
// We want to find something like this:
//
// val = G_CONSTANT LegalShiftVal
// shift = G_SHL off_reg val
// ptr = G_PTR_ADD base_reg shift
// x = G_LOAD ptr
//
// And fold it into this addressing mode:
//
// ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
// Check if we can find the G_PTR_ADD.
MachineInstr *PtrAdd =
getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
return None;
// Now, try to match an opcode which will match our specific offset.
// We want a G_SHL or a G_MUL.
MachineInstr *OffsetInst =
getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
return selectExtendedSHL(Root, PtrAdd->getOperand(1),
OffsetInst->getOperand(0), SizeInBytes,
/*WantsExt=*/false);
}
/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3]
///
/// Where x2 is the base register, and x3 is an offset register.
///
/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
/// this will do so. Otherwise, it will return None.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeRegisterOffset(
MachineOperand &Root) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
// We need a GEP.
MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
return None;
// If this is used more than once, let's not bother folding.
// TODO: Check if they are memory ops. If they are, then we can still fold
// without having to recompute anything.
if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
return None;
// Base is the GEP's LHS, offset is its RHS.
return {{[=](MachineInstrBuilder &MIB) {
MIB.addUse(Gep->getOperand(1).getReg());
},
[=](MachineInstrBuilder &MIB) {
MIB.addUse(Gep->getOperand(2).getReg());
},
[=](MachineInstrBuilder &MIB) {
// Need to add both immediates here to make sure that they are both
// added to the instruction.
MIB.addImm(0);
MIB.addImm(0);
}}};
}
/// This is intended to be equivalent to selectAddrModeXRO in
/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
unsigned SizeInBytes) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
if (!Root.isReg())
return None;
MachineInstr *PtrAdd =
getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
if (!PtrAdd)
return None;
// Check for an immediates which cannot be encoded in the [base + imm]
// addressing mode, and can't be encoded in an add/sub. If this happens, we'll
// end up with code like:
//
// mov x0, wide
// add x1 base, x0
// ldr x2, [x1, x0]
//
// In this situation, we can use the [base, xreg] addressing mode to save an
// add/sub:
//
// mov x0, wide
// ldr x2, [base, x0]
auto ValAndVReg =
getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
if (ValAndVReg) {
unsigned Scale = Log2_32(SizeInBytes);
int64_t ImmOff = ValAndVReg->Value.getSExtValue();
// Skip immediates that can be selected in the load/store addresing
// mode.
if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
ImmOff < (0x1000 << Scale))
return None;
// Helper lambda to decide whether or not it is preferable to emit an add.
auto isPreferredADD = [](int64_t ImmOff) {
// Constants in [0x0, 0xfff] can be encoded in an add.
if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
return true;
// Can it be encoded in an add lsl #12?
if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
return false;
// It can be encoded in an add lsl #12, but we may not want to. If it is
// possible to select this as a single movz, then prefer that. A single
// movz is faster than an add with a shift.
return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
(ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
};
// If the immediate can be encoded in a single add/sub, then bail out.
if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
return None;
}
// Try to fold shifts into the addressing mode.
auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
if (AddrModeFns)
return AddrModeFns;
// If that doesn't work, see if it's possible to fold in registers from
// a GEP.
return selectAddrModeRegisterOffset(Root);
}
/// This is used for computing addresses like this:
///
/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
///
/// Where we have a 64-bit base register, a 32-bit offset register, and an
/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
unsigned SizeInBytes) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
MachineInstr *PtrAdd =
getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
return None;
MachineOperand &LHS = PtrAdd->getOperand(1);
MachineOperand &RHS = PtrAdd->getOperand(2);
MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
// The first case is the same as selectAddrModeXRO, except we need an extend.
// In this case, we try to find a shift and extend, and fold them into the
// addressing mode.
//
// E.g.
//
// off_reg = G_Z/S/ANYEXT ext_reg
// val = G_CONSTANT LegalShiftVal
// shift = G_SHL off_reg val
// ptr = G_PTR_ADD base_reg shift
// x = G_LOAD ptr
//
// In this case we can get a load like this:
//
// ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
SizeInBytes, /*WantsExt=*/true);
if (ExtendedShl)
return ExtendedShl;
// There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
//
// e.g.
// ldr something, [base_reg, ext_reg, sxtw]
if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
return None;
// Check if this is an extend. We'll get an extend type if it is.
AArch64_AM::ShiftExtendType Ext =
getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
// Need a 32-bit wide register.
MachineIRBuilder MIB(*PtrAdd);
Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
AArch64::GPR32RegClass, MIB);
unsigned SignExtend = Ext == AArch64_AM::SXTW;
// Base is LHS, offset is ExtReg.
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
[=](MachineInstrBuilder &MIB) {
MIB.addImm(SignExtend);
MIB.addImm(0);
}}};
}
/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
/// memory reference, which is needed here to know what is valid for a scaled
/// immediate.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
unsigned Size) const {
MachineRegisterInfo &MRI =
Root.getParent()->getParent()->getParent()->getRegInfo();
if (!Root.isReg())
return None;
if (!isBaseWithConstantOffset(Root, MRI))
return None;
MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
if (!RootDef)
return None;
MachineOperand &OffImm = RootDef->getOperand(2);
if (!OffImm.isReg())
return None;
MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
return None;
int64_t RHSC;
MachineOperand &RHSOp1 = RHS->getOperand(1);
if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
return None;
RHSC = RHSOp1.getCImm()->getSExtValue();
// If the offset is valid as a scaled immediate, don't match here.
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
return None;
if (RHSC >= -256 && RHSC < 256) {
MachineOperand &Base = RootDef->getOperand(1);
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(Base); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
}};
}
return None;
}
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
unsigned Size,
MachineRegisterInfo &MRI) const {
if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
return None;
MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
if (Adrp.getOpcode() != AArch64::ADRP)
return None;
// TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
// TODO: Need to check GV's offset % size if doing offset folding into globals.
assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global");
auto GV = Adrp.getOperand(1).getGlobal();
if (GV->isThreadLocal())
return None;
auto &MF = *RootDef.getParent()->getParent();
if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
return None;
unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
MachineIRBuilder MIRBuilder(RootDef);
Register AdrpReg = Adrp.getOperand(0).getReg();
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
[=](MachineInstrBuilder &MIB) {
MIB.addGlobalAddress(GV, /* Offset */ 0,
OpFlags | AArch64II::MO_PAGEOFF |
AArch64II::MO_NC);
}}};
}
/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const {
MachineFunction &MF = *Root.getParent()->getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
if (!Root.isReg())
return None;
MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
if (!RootDef)
return None;
if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
}};
}
CodeModel::Model CM = MF.getTarget().getCodeModel();
// Check if we can fold in the ADD of small code model ADRP + ADD address.
if (CM == CodeModel::Small) {
auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
if (OpFns)
return OpFns;
}
if (isBaseWithConstantOffset(Root, MRI)) {
MachineOperand &LHS = RootDef->getOperand(1);
MachineOperand &RHS = RootDef->getOperand(2);
MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
if (LHSDef && RHSDef) {
int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
unsigned Scale = Log2_32(Size);
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
}};
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
}};
}
}
}
// Before falling back to our general case, check if the unscaled
// instructions can handle this. If so, that's preferable.
if (selectAddrModeUnscaled(Root, Size).hasValue())
return None;
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(Root); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
}};
}
/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
// TODO: Handle AArch64_AM::ROR
switch (MI.getOpcode()) {
default:
return AArch64_AM::InvalidShiftExtend;
case TargetOpcode::G_SHL:
return AArch64_AM::LSL;
case TargetOpcode::G_LSHR:
return AArch64_AM::LSR;
case TargetOpcode::G_ASHR:
return AArch64_AM::ASR;
}
}
/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
///
/// TODO: Allow shifted register to be rotated in logical instructions.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
if (!Root.isReg())
return None;
MachineRegisterInfo &MRI =
Root.getParent()->getParent()->getParent()->getRegInfo();
// Check if the operand is defined by an instruction which corresponds to
// a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
//
// TODO: Handle AArch64_AM::ROR for logical instructions.
MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
if (!ShiftInst)
return None;
AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
if (ShType == AArch64_AM::InvalidShiftExtend)
return None;
if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
return None;
// Need an immediate on the RHS.
MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
auto Immed = getImmedFromMO(ShiftRHS);
if (!Immed)
return None;
// We have something that we can fold. Fold in the shift's LHS and RHS into
// the instruction.
MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
Register ShiftReg = ShiftLHS.getReg();
unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
unsigned Val = *Immed & (NumBits - 1);
unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}
AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
unsigned Opc = MI.getOpcode();
// Handle explicit extend instructions first.
if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
unsigned Size;
if (Opc == TargetOpcode::G_SEXT)
Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
else
Size = MI.getOperand(2).getImm();
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
return AArch64_AM::SXTB;
case 16:
return AArch64_AM::SXTH;
case 32:
return AArch64_AM::SXTW;
default:
return AArch64_AM::InvalidShiftExtend;
}
}
if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
return AArch64_AM::UXTB;
case 16:
return AArch64_AM::UXTH;
case 32:
return AArch64_AM::UXTW;
default:
return AArch64_AM::InvalidShiftExtend;
}
}
// Don't have an explicit extend. Try to handle a G_AND with a constant mask
// on the RHS.
if (Opc != TargetOpcode::G_AND)
return AArch64_AM::InvalidShiftExtend;
Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
if (!MaybeAndMask)
return AArch64_AM::InvalidShiftExtend;
uint64_t AndMask = *MaybeAndMask;
switch (AndMask) {
default:
return AArch64_AM::InvalidShiftExtend;
case 0xFF:
return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
case 0xFFFF:
return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
case 0xFFFFFFFF:
return AArch64_AM::UXTW;
}
}
Register AArch64InstructionSelector::moveScalarRegClass(
Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
MachineRegisterInfo &MRI = *MIB.getMRI();
auto Ty = MRI.getType(Reg);
assert(!Ty.isVector() && "Expected scalars only!");
if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
return Reg;
// Create a copy and immediately select it.
// FIXME: We should have an emitCopy function?
auto Copy = MIB.buildCopy({&RC}, {Reg});
selectCopy(*Copy, TII, MRI, TRI, RBI);
return Copy.getReg(0);
}
/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
MachineOperand &Root) const {
if (!Root.isReg())
return None;
MachineRegisterInfo &MRI =
Root.getParent()->getParent()->getParent()->getRegInfo();
uint64_t ShiftVal = 0;
Register ExtReg;
AArch64_AM::ShiftExtendType Ext;
MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
if (!RootDef)
return None;
if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
return None;
// Check if we can fold a shift and an extend.
if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
// Look for a constant on the RHS of the shift.
MachineOperand &RHS = RootDef->getOperand(2);
Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
if (!MaybeShiftVal)
return None;
ShiftVal = *MaybeShiftVal;
if (ShiftVal > 4)
return None;
// Look for a valid extend instruction on the LHS of the shift.
MachineOperand &LHS = RootDef->getOperand(1);
MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
if (!ExtDef)
return None;
Ext = getExtendTypeForInst(*ExtDef, MRI);
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
ExtReg = ExtDef->getOperand(1).getReg();
} else {
// Didn't get a shift. Try just folding an extend.
Ext = getExtendTypeForInst(*RootDef, MRI);
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
ExtReg = RootDef->getOperand(1).getReg();
// If we have a 32 bit instruction which zeroes out the high half of a
// register, we get an implicit zero extend for free. Check if we have one.
// FIXME: We actually emit the extend right now even though we don't have
// to.
if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
if (ExtInst && isDef32(*ExtInst))
return None;
}
}
// We require a GPR32 here. Narrow the ExtReg if needed using a subregister
// copy.
MachineIRBuilder MIB(*RootDef);
ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
[=](MachineInstrBuilder &MIB) {
MIB.addImm(getArithExtendImm(Ext, ShiftVal));
}}};
}
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
TableGen/GlobalISel: Add way for SDNodeXForm to work on timm The current implementation assumes there is an instruction associated with the transform, but this is not the case for timm/TargetConstant/immarg values. These transforms should directly operate on a specific MachineOperand in the source instruction. TableGen would assert if you attempted to define an equivalent GISDNodeXFormEquiv using timm when it failed to find the instruction matcher. Specially recognize SDNodeXForms on timm, and pass the operand index to the render function. Ideally this would be a separate render function type that looks like void renderFoo(MachineInstrBuilder, const MachineOperand&), but this proved to be somewhat mechanically painful. Add an optional operand index which will only be passed if the transform should only look at the one source operand. Theoretically it would also be possible to only ever pass the MachineOperand, and the existing renderers would check the parent. I think that would be somewhat ugly for the standard usage which may want to inspect other operands, and I also think MachineOperand should eventually not carry a pointer to the parent instruction. Use it in one sample pattern. This isn't a great example, since the transform exists to satisfy DAG type constraints. This could also be avoided by just changing the MachineInstr's arbitrary choice of operand type from i16 to i32. Other patterns have nontrivial uses, but this serves as the simplest example. One flaw this still has is if you try to use an SDNodeXForm defined for imm, but the source pattern uses timm, you still see the "Failed to lookup instruction" assert. However, there is now a way to avoid it.
2020-01-09 01:53:15 +08:00
const MachineInstr &MI,
int OpIdx) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
TableGen/GlobalISel: Add way for SDNodeXForm to work on timm The current implementation assumes there is an instruction associated with the transform, but this is not the case for timm/TargetConstant/immarg values. These transforms should directly operate on a specific MachineOperand in the source instruction. TableGen would assert if you attempted to define an equivalent GISDNodeXFormEquiv using timm when it failed to find the instruction matcher. Specially recognize SDNodeXForms on timm, and pass the operand index to the render function. Ideally this would be a separate render function type that looks like void renderFoo(MachineInstrBuilder, const MachineOperand&), but this proved to be somewhat mechanically painful. Add an optional operand index which will only be passed if the transform should only look at the one source operand. Theoretically it would also be possible to only ever pass the MachineOperand, and the existing renderers would check the parent. I think that would be somewhat ugly for the standard usage which may want to inspect other operands, and I also think MachineOperand should eventually not carry a pointer to the parent instruction. Use it in one sample pattern. This isn't a great example, since the transform exists to satisfy DAG type constraints. This could also be avoided by just changing the MachineInstr's arbitrary choice of operand type from i16 to i32. Other patterns have nontrivial uses, but this serves as the simplest example. One flaw this still has is if you try to use an SDNodeXForm defined for imm, but the source pattern uses timm, you still see the "Failed to lookup instruction" assert. However, there is now a way to avoid it.
2020-01-09 01:53:15 +08:00
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
Optional<int64_t> CstVal =
getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
assert(CstVal && "Expected constant value");
MIB.addImm(CstVal.getValue());
}
void AArch64InstructionSelector::renderLogicalImm32(
MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
TableGen/GlobalISel: Add way for SDNodeXForm to work on timm The current implementation assumes there is an instruction associated with the transform, but this is not the case for timm/TargetConstant/immarg values. These transforms should directly operate on a specific MachineOperand in the source instruction. TableGen would assert if you attempted to define an equivalent GISDNodeXFormEquiv using timm when it failed to find the instruction matcher. Specially recognize SDNodeXForms on timm, and pass the operand index to the render function. Ideally this would be a separate render function type that looks like void renderFoo(MachineInstrBuilder, const MachineOperand&), but this proved to be somewhat mechanically painful. Add an optional operand index which will only be passed if the transform should only look at the one source operand. Theoretically it would also be possible to only ever pass the MachineOperand, and the existing renderers would check the parent. I think that would be somewhat ugly for the standard usage which may want to inspect other operands, and I also think MachineOperand should eventually not carry a pointer to the parent instruction. Use it in one sample pattern. This isn't a great example, since the transform exists to satisfy DAG type constraints. This could also be avoided by just changing the MachineInstr's arbitrary choice of operand type from i16 to i32. Other patterns have nontrivial uses, but this serves as the simplest example. One flaw this still has is if you try to use an SDNodeXForm defined for imm, but the source pattern uses timm, you still see the "Failed to lookup instruction" assert. However, there is now a way to avoid it.
2020-01-09 01:53:15 +08:00
assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
MIB.addImm(Enc);
}
void AArch64InstructionSelector::renderLogicalImm64(
MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
TableGen/GlobalISel: Add way for SDNodeXForm to work on timm The current implementation assumes there is an instruction associated with the transform, but this is not the case for timm/TargetConstant/immarg values. These transforms should directly operate on a specific MachineOperand in the source instruction. TableGen would assert if you attempted to define an equivalent GISDNodeXFormEquiv using timm when it failed to find the instruction matcher. Specially recognize SDNodeXForms on timm, and pass the operand index to the render function. Ideally this would be a separate render function type that looks like void renderFoo(MachineInstrBuilder, const MachineOperand&), but this proved to be somewhat mechanically painful. Add an optional operand index which will only be passed if the transform should only look at the one source operand. Theoretically it would also be possible to only ever pass the MachineOperand, and the existing renderers would check the parent. I think that would be somewhat ugly for the standard usage which may want to inspect other operands, and I also think MachineOperand should eventually not carry a pointer to the parent instruction. Use it in one sample pattern. This isn't a great example, since the transform exists to satisfy DAG type constraints. This could also be avoided by just changing the MachineInstr's arbitrary choice of operand type from i16 to i32. Other patterns have nontrivial uses, but this serves as the simplest example. One flaw this still has is if you try to use an SDNodeXForm defined for imm, but the source pattern uses timm, you still see the "Failed to lookup instruction" assert. However, there is now a way to avoid it.
2020-01-09 01:53:15 +08:00
assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
MIB.addImm(Enc);
}
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
const MachineInstr &MI, unsigned NumBytes) const {
if (!MI.mayLoadOrStore())
return false;
assert(MI.hasOneMemOperand() &&
"Expected load/store to have only one mem op!");
return (*MI.memoperands_begin())->getSize() == NumBytes;
}
bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
return false;
// Only return true if we know the operation will zero-out the high half of
// the 64-bit register. Truncates can be subregister copies, which don't
// zero out the high bits. Copies and other copy-like instructions can be
// fed by truncates, or could be lowered as subregister copies.
switch (MI.getOpcode()) {
default:
return true;
case TargetOpcode::COPY:
case TargetOpcode::G_BITCAST:
case TargetOpcode::G_TRUNC:
case TargetOpcode::G_PHI:
return false;
}
}
// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
const AArch64RegisterBankInfo &RBI) {
assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
Register DstReg = MI.getOperand(0).getReg();
const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
assert(DstRB && "Expected PHI dst to have regbank assigned");
MachineIRBuilder MIB(MI);
// Go through each operand and ensure it has the same regbank.
for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
MachineOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg())
continue;
Register OpReg = MO.getReg();
const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
if (RB != DstRB) {
// Insert a cross-bank copy.
auto *OpDef = MRI.getVRegDef(OpReg);
const LLT &Ty = MRI.getType(OpReg);
MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
auto Copy = MIB.buildCopy(Ty, OpReg);
MRI.setRegBank(Copy.getReg(0), *DstRB);
MO.setReg(Copy.getReg(0));
}
}
}
void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
// We're looking for PHIs, build a list so we don't invalidate iterators.
MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<MachineInstr *, 32> Phis;
for (auto &BB : MF) {
for (auto &MI : BB) {
if (MI.getOpcode() == TargetOpcode::G_PHI)
Phis.emplace_back(&MI);
}
}
for (auto *MI : Phis) {
// We need to do some work here if the operand types are < 16 bit and they
// are split across fpr/gpr banks. Since all types <32b on gpr
// end up being assigned gpr32 regclasses, we can end up with PHIs here
// which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
// be selecting heterogenous regbanks for operands if possible, but we
// still need to be able to deal with it here.
//
// To fix this, if we have a gpr-bank operand < 32b in size and at least
// one other operand is on the fpr bank, then we add cross-bank copies
// to homogenize the operand banks. For simplicity the bank that we choose
// to settle on is whatever bank the def operand has. For example:
//
// %endbb:
// %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
// =>
// %bb2:
// ...
// %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
// ...
// %endbb:
// %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
bool HasGPROp = false, HasFPROp = false;
for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
const auto &MO = MI->getOperand(OpIdx);
if (!MO.isReg())
continue;
const LLT &Ty = MRI.getType(MO.getReg());
if (!Ty.isValid() || !Ty.isScalar())
break;
if (Ty.getSizeInBits() >= 32)
break;
const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
// If for some reason we don't have a regbank yet. Don't try anything.
if (!RB)
break;
if (RB->getID() == AArch64::GPRRegBankID)
HasGPROp = true;
else
HasFPROp = true;
}
// We have heterogenous regbanks, need to fixup.
if (HasGPROp && HasFPROp)
fixupPHIOpBanks(*MI, MRI, RBI);
}
}
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
AArch64Subtarget &Subtarget,
AArch64RegisterBankInfo &RBI) {
return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
}