[GlobalISel] Combine (a[0]) | (a[1] << k1) | ... | (a[m] << kn) into a wide load

This is a restricted version of the combine in `DAGCombiner::MatchLoadCombine`.
(See D27861)

This tries to recognize patterns like the ones below (assuming a little-endian target):

```
s8 *a = ...
s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
->
s32 val = *((s32)a)

s8 *a = ...
s32 val = a[3] | (a[2] << 8) | (a[1] << 16) | (a[0] << 24)
->
s32 val = BSWAP(*((s32)a))
```

(This patch handles big-endian targets as well; there, the first example above
needs a BSWAP and the second does not.)

To recognize the pattern, this searches from the last G_OR in the expression
tree.

E.g.

```
    Reg   Reg
     \    /
      OR_1   Reg
       \    /
        OR_2
          \     Reg
           .. /
          Root
```

Each non-OR register in the tree is put in a list. Each register in the list is
then checked to see if it is produced by an appropriate load + shift.

If every register is a load + potentially a shift, the combine checks whether
those loads + shifts, when OR'd together, are equivalent to a wide load
(possibly with a BSWAP).
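
For illustration, a C routine along these lines (a hypothetical example, not
taken from the patch or its tests) compiles down to exactly this kind of
load + shift + OR tree:

```
#include <stdint.h>

// Reads a 32-bit little-endian value from a byte buffer. Each byte is
// loaded, zero-extended, shifted into its position, and OR'd together --
// the a[0] | (a[1] << 8) | ... shape this combine looks for.
uint32_t read_le32(const uint8_t *a) {
  return (uint32_t)a[0] | ((uint32_t)a[1] << 8) | ((uint32_t)a[2] << 16) |
         ((uint32_t)a[3] << 24);
}
```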

To simplify things, this patch

(1) Only handles G_ZEXTLOADs (which appear to be the common case)
(2) Only works in a single MachineBasicBlock
(3) Only handles G_SHL as the bit twiddling to stick the small load into a
    specific location

An IR example of this is here: https://godbolt.org/z/4sP9Pj (lifted from
test/CodeGen/AArch64/load-combine.ll)

At -Os on AArch64, this is a 0.5% code size improvement for CTMark/sqlite3,
and a 0.4% improvement for CTMark/7zip-benchmark.

Also fix a bug in `isPredecessor` which caused it to fail whenever `DefMI` was
the first instruction in the block.

Differential Revision: https://reviews.llvm.org/D94350
Jessica Paquette 2020-12-18 12:56:14 -08:00
parent 9c6a00fe99
commit cfc6073017
7 changed files with 2136 additions and 8 deletions


@@ -18,6 +18,7 @@
#define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Alignment.h"
@@ -471,6 +472,20 @@ public:
  bool applyCombineInsertVecElts(MachineInstr &MI,
                                 SmallVectorImpl<Register> &MatchInfo);

  /// Match expression trees of the form
  ///
  /// \code
  ///   sN *a = ...
  ///   sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ...
  /// \endcode
  ///
  /// And check if the tree can be replaced with an M-bit load + possibly a
  /// bswap.
  bool matchLoadOrCombine(MachineInstr &MI,
                          std::function<void(MachineIRBuilder &)> &MatchInfo);
  bool applyLoadOrCombine(MachineInstr &MI,
                          std::function<void(MachineIRBuilder &)> &MatchInfo);

  /// Try to transform \p MI by using all of the above
  /// combine functions. Returns true if changed.
  bool tryCombine(MachineInstr &MI);
@@ -499,6 +514,30 @@ private:
  /// \returns true if a candidate is found.
  bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base,
                             Register &Offset);

  /// Helper function for matchLoadOrCombine. Searches for Registers
  /// which may have been produced by a load instruction + some arithmetic.
  ///
  /// \param [in] Root - The search root.
  ///
  /// \returns The Registers found during the search.
  Optional<SmallVector<Register, 8>>
  findCandidatesForLoadOrCombine(const MachineInstr *Root) const;

  /// Helper function for matchLoadOrCombine.
  ///
  /// Checks if every register in \p RegsToVisit is defined by a load
  /// instruction + some arithmetic.
  ///
  /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up
  /// at to the index of the load.
  /// \param [in] MemSizeInBits - The number of bits each load should produce.
  ///
  /// \returns The lowest-index load found and the lowest index on success.
  Optional<std::pair<MachineInstr *, int64_t>> findLoadOffsetsForLoadOrCombine(
      SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
      const SmallVector<Register, 8> &RegsToVisit,
      const unsigned MemSizeInBits);
};
} // namespace llvm


@ -1658,6 +1658,11 @@ public:
                          const MachineMemOperand &MMO,
                          bool *Fast = nullptr) const;

  /// LLT handling variant.
  bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty,
                          const MachineMemOperand &MMO,
                          bool *Fast = nullptr) const;

  /// Returns the target specific optimal type for load and store operations as
  /// a result of memset, memcpy, and memmove lowering.
  /// It returns EVT::Other if the type should be determined using generic


@@ -545,6 +545,14 @@ def combine_insert_vec_elts_build_vector : GICombineRule<
         [{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
  (apply [{ return Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;

def load_or_combine_matchdata :
  GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
def load_or_combine : GICombineRule<
  (defs root:$root, load_or_combine_matchdata:$info),
  (match (wip_match_opcode G_OR):$root,
         [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
  (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>;

// Currently only the one combine above.
def insert_vec_elt_combines : GICombineGroup<
  [combine_insert_vec_elts_build_vector]>;

@@ -587,4 +595,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
    unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
    unmerge_zext_to_zext, trunc_ext_fold, trunc_shl,
    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
-   shift_immed_chain, shift_of_shifted_logic_chain]>;
+   shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>;


@@ -48,6 +48,66 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
return *Builder.getMF().getSubtarget().getTargetLowering();
}
/// \returns The little endian in-memory byte position of byte \p I in a
/// \p ByteWidth bytes wide type.
///
/// E.g. Given a 4-byte type x, x[0] -> byte 0
static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
assert(I < ByteWidth && "I must be in [0, ByteWidth)");
return I;
}
/// \returns The big endian in-memory byte position of byte \p I in a
/// \p ByteWidth bytes wide type.
///
/// E.g. Given a 4-byte type x, x[0] -> byte 3
static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
assert(I < ByteWidth && "I must be in [0, ByteWidth)");
return ByteWidth - I - 1;
}
/// Given a map from byte offsets in memory to indices in a load/store,
/// determine if that map corresponds to a little or big endian byte pattern.
///
/// \param MemOffset2Idx maps memory offsets to address offsets.
/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
///
/// \returns true if the map corresponds to a big endian byte pattern, false
/// if it corresponds to a little endian byte pattern, and None otherwise.
///
/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
/// are as follows:
///
///   AddrOffset   Little endian   Big endian
///   0            0               3
///   1            1               2
///   2            2               1
///   3            3               0
static Optional<bool>
isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
int64_t LowestIdx) {
// Need at least two byte positions to decide on endianness.
unsigned Width = MemOffset2Idx.size();
if (Width < 2)
return None;
bool BigEndian = true, LittleEndian = true;
for (unsigned MemOffset = 0; MemOffset < Width; ++ MemOffset) {
auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
if (MemOffsetAndIdx == MemOffset2Idx.end())
return None;
const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
assert(Idx >= 0 && "Expected non-negative byte offset?");
LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
if (!BigEndian && !LittleEndian)
return None;
}
assert((BigEndian != LittleEndian) &&
"Pattern cannot be both big and little endian!");
return BigEndian;
}
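
As a standalone illustration of the endianness classification above (a
hypothetical sketch, not part of the patch; `std::map` and `std::optional`
stand in for the LLVM types), the two byte patterns from the commit message
classify as follows:

```
// Hypothetical sketch of the byte-pattern classification implemented above.
#include <cassert>
#include <cstdint>
#include <map>
#include <optional>

static unsigned littleByteAt(unsigned ByteWidth, unsigned I) { return I; }
static unsigned bigByteAt(unsigned ByteWidth, unsigned I) {
  return ByteWidth - I - 1;
}

// Maps "byte offset in the wide value" -> "load index", like MemOffset2Idx
// in the patch, and decides which endianness that mapping spells.
static std::optional<bool>
isBigEndianPattern(const std::map<int64_t, int64_t> &MemOffset2Idx,
                   int64_t LowestIdx) {
  unsigned Width = MemOffset2Idx.size();
  if (Width < 2)
    return std::nullopt;
  bool Big = true, Little = true;
  for (unsigned Off = 0; Off != Width; ++Off) {
    auto It = MemOffset2Idx.find(Off);
    if (It == MemOffset2Idx.end())
      return std::nullopt;
    int64_t Idx = It->second - LowestIdx;
    Little &= Idx == littleByteAt(Width, Off);
    Big &= Idx == bigByteAt(Width, Off);
    if (!Big && !Little)
      return std::nullopt;
  }
  return Big;
}

int main() {
  // a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24): little-endian pattern.
  assert(isBigEndianPattern({{0, 0}, {1, 1}, {2, 2}, {3, 3}}, 0) == false);
  // a[3] | (a[2] << 8) | (a[1] << 16) | (a[0] << 24): big-endian pattern.
  assert(isBigEndianPattern({{0, 3}, {1, 2}, {2, 1}, {3, 0}}, 0) == true);
}
```
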
bool CombinerHelper::isLegalOrBeforeLegalizer(
const LegalityQuery &Query) const {
return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -564,13 +624,16 @@ bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
  assert(DefMI.getParent() == UseMI.getParent());
  if (&DefMI == &UseMI)
    return false;
-  // Loop through the basic block until we find one of the instructions.
-  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
-  for (; &*I != &DefMI && &*I != &UseMI; ++I)
-    return &*I == &DefMI;
-  llvm_unreachable("Block must contain instructions");
+  const MachineBasicBlock &MBB = *DefMI.getParent();
+  auto NonDbgInsts =
+      instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+  auto DefOrUse =
+      find_if(NonDbgInsts, [&DefMI, &UseMI](const MachineInstr &MI) {
+        return &MI == &DefMI || &MI == &UseMI;
+      });
+  if (DefOrUse == NonDbgInsts.end())
+    llvm_unreachable("Block must contain both DefMI and UseMI!");
+  return &*DefOrUse == &DefMI;
}
bool CombinerHelper::dominates(const MachineInstr &DefMI,
@@ -3152,6 +3215,361 @@ bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
return true;
}
Optional<SmallVector<Register, 8>>
CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
// We want to detect if Root is part of a tree which represents a bunch
// of loads being merged into a larger load. We'll try to recognize patterns
// like, for example:
//
// Reg Reg
// \ /
// OR_1 Reg
// \ /
// OR_2
// \ Reg
// .. /
// Root
//
// Reg Reg Reg Reg
// \ / \ /
// OR_1 OR_2
// \ /
// \ /
// ...
// Root
//
// Each "Reg" may have been produced by a load + some arithmetic. This
// function will save each of them.
SmallVector<Register, 8> RegsToVisit;
SmallVector<const MachineInstr *, 7> Ors = {Root};
// In the "worst" case, we're dealing with a load for each byte. So, there
// are at most #bytes - 1 ORs.
const unsigned MaxIter =
MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
if (Ors.empty())
break;
const MachineInstr *Curr = Ors.pop_back_val();
Register OrLHS = Curr->getOperand(1).getReg();
Register OrRHS = Curr->getOperand(2).getReg();
// In the combine, we want to eliminate the entire tree.
if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
return None;
// If it's a G_OR, save it and continue to walk. If it's not, then it's
// something that may be a load + arithmetic.
if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
Ors.push_back(Or);
else
RegsToVisit.push_back(OrLHS);
if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
Ors.push_back(Or);
else
RegsToVisit.push_back(OrRHS);
}
// We're going to try and merge each register into a wider power-of-2 type,
// so we ought to have an even number of registers.
if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
return None;
return RegsToVisit;
}
/// Helper function for findLoadOffsetsForLoadOrCombine.
///
/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
/// and then moving that value into a specific byte offset.
///
/// e.g. x[i] << 24
///
/// \returns The load instruction and the byte offset it is moved into.
static Optional<std::pair<MachineInstr *, int64_t>>
matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
const MachineRegisterInfo &MRI) {
assert(MRI.hasOneNonDBGUse(Reg) &&
"Expected Reg to only have one non-debug use?");
Register MaybeLoad;
int64_t Shift;
if (!mi_match(Reg, MRI,
m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
Shift = 0;
MaybeLoad = Reg;
}
if (Shift % MemSizeInBits != 0)
return None;
// TODO: Handle other types of loads.
auto *Load = getOpcodeDef(TargetOpcode::G_ZEXTLOAD, MaybeLoad, MRI);
if (!Load)
return None;
const auto &MMO = **Load->memoperands_begin();
if (!MMO.isUnordered() || MMO.getSizeInBits() != MemSizeInBits)
return None;
return std::make_pair(Load, Shift / MemSizeInBits);
}
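
A minimal standalone sketch (again hypothetical, not part of the patch) of the
shift-to-byte-position mapping this helper performs:

```
// Hypothetical illustration: which byte of the wide value a narrow load
// lands in, given its shift amount. Mirrors the Shift / MemSizeInBits
// computation above; shifts that are not a multiple of the load size are
// rejected.
#include <cassert>
#include <cstdint>
#include <optional>

static std::optional<int64_t> bytePosition(int64_t ShiftInBits,
                                           unsigned MemSizeInBits) {
  if (ShiftInBits % MemSizeInBits != 0)
    return std::nullopt;
  return ShiftInBits / MemSizeInBits;
}

int main() {
  assert(bytePosition(0, 8) == 0);  // a[i]       -> byte 0
  assert(bytePosition(8, 8) == 1);  // a[i] << 8  -> byte 1
  assert(bytePosition(24, 8) == 3); // a[i] << 24 -> byte 3
  assert(!bytePosition(12, 8));     // a[i] << 12 -> rejected
}
```
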
Optional<std::pair<MachineInstr *, int64_t>>
CombinerHelper::findLoadOffsetsForLoadOrCombine(
SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {
// Each load found for the pattern. There should be one for each RegsToVisit.
SmallSetVector<const MachineInstr *, 8> Loads;
// The lowest index used in any load. (The lowest "i" for each x[i].)
int64_t LowestIdx = INT64_MAX;
// The load which uses the lowest index.
MachineInstr *LowestIdxLoad = nullptr;
// Keeps track of the load indices we see. We shouldn't see any indices twice.
SmallSet<int64_t, 8> SeenIdx;
// Ensure each load is in the same MBB.
// TODO: Support multiple MachineBasicBlocks.
MachineBasicBlock *MBB = nullptr;
const MachineMemOperand *MMO = nullptr;
// Earliest instruction-order load in the pattern.
MachineInstr *EarliestLoad = nullptr;
// Latest instruction-order load in the pattern.
MachineInstr *LatestLoad = nullptr;
// Base pointer which every load should share.
Register BasePtr;
// We want to find a load for each register. Each load should have some
// appropriate bit twiddling arithmetic. During this loop, we will also keep
// track of the load which uses the lowest index. Later, we will check if we
// can use its pointer in the final, combined load.
for (auto Reg : RegsToVisit) {
// Find the load, and find the position that its (possibly shifted) value
// will end up in.
auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
if (!LoadAndPos)
return None;
MachineInstr *Load;
int64_t DstPos;
std::tie(Load, DstPos) = *LoadAndPos;
// TODO: Handle multiple MachineBasicBlocks. Currently not handled because
// it is difficult to check for stores/calls/etc between loads.
MachineBasicBlock *LoadMBB = Load->getParent();
if (!MBB)
MBB = LoadMBB;
if (LoadMBB != MBB)
return None;
// Make sure that the MachineMemOperands of every seen load are compatible.
const MachineMemOperand *LoadMMO = *Load->memoperands_begin();
if (!MMO)
MMO = LoadMMO;
if (MMO->getAddrSpace() != LoadMMO->getAddrSpace())
return None;
// Find out what the base pointer and index for the load is.
Register LoadPtr;
int64_t Idx;
if (!mi_match(Load->getOperand(1).getReg(), MRI,
m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
LoadPtr = Load->getOperand(1).getReg();
Idx = 0;
}
// Don't combine things like a[i], a[i] -> a bigger load.
if (!SeenIdx.insert(Idx).second)
return None;
// Every load must share the same base pointer; don't combine things like:
//
// a[i], b[i + 1] -> a bigger load.
if (!BasePtr.isValid())
BasePtr = LoadPtr;
if (BasePtr != LoadPtr)
return None;
if (Idx < LowestIdx) {
LowestIdx = Idx;
LowestIdxLoad = Load;
}
// Keep track of the byte offset that this load ends up at. If we have seen
// the byte offset, then stop here. We do not want to combine:
//
// a[i] << 16, a[i + k] << 16 -> a bigger load.
if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
return None;
Loads.insert(Load);
// Keep track of the position of the earliest/latest loads in the pattern.
// We will check that there are no load fold barriers between them later
// on.
//
// FIXME: Is there a better way to check for load fold barriers?
if (!EarliestLoad || dominates(*Load, *EarliestLoad))
EarliestLoad = Load;
if (!LatestLoad || dominates(*LatestLoad, *Load))
LatestLoad = Load;
}
// We found a load for each register. Let's check if each load satisfies the
// pattern.
assert(Loads.size() == RegsToVisit.size() &&
"Expected to find a load for each register?");
assert(EarliestLoad != LatestLoad && EarliestLoad &&
LatestLoad && "Expected at least two loads?");
// Check if there are any stores, calls, etc. between any of the loads. If
// there are, then we can't safely perform the combine.
//
// MaxIter is chosen based off the (worst case) number of iterations it
// typically takes to succeed in the LLVM test suite plus some padding.
//
// FIXME: Is there a better way to check for load fold barriers?
const unsigned MaxIter = 20;
unsigned Iter = 0;
for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
LatestLoad->getIterator())) {
if (Loads.count(&MI))
continue;
if (MI.isLoadFoldBarrier())
return None;
if (Iter++ == MaxIter)
return None;
}
return std::make_pair(LowestIdxLoad, LowestIdx);
}
bool CombinerHelper::matchLoadOrCombine(
MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_OR);
MachineFunction &MF = *MI.getMF();
// Assuming a little-endian target, transform:
// s8 *a = ...
// s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
// =>
// s32 val = *((i32)a)
//
// s8 *a = ...
// s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
// =>
// s32 val = BSWAP(*((s32)a))
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
if (Ty.isVector())
return false;
// We need to combine at least two loads into this type. Since the smallest
// possible load is into a byte, we need at least a 16-bit wide type.
const unsigned WideMemSizeInBits = Ty.getSizeInBits();
if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
return false;
// Match a collection of non-OR instructions in the pattern.
auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
if (!RegsToVisit)
return false;
// We have a collection of non-OR instructions. Figure out how wide each of
// the small loads should be based off of the number of potential loads we
// found.
const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
if (NarrowMemSizeInBits % 8 != 0)
return false;
// Check if each register feeding into each OR is a load from the same
// base pointer + some arithmetic.
//
// e.g. a[0], a[1] << 8, a[2] << 16, etc.
//
// Also verify that each of these ends up putting a[i] into the same memory
// offset as a load into a wide type would.
SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
MachineInstr *LowestIdxLoad;
int64_t LowestIdx;
auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
if (!MaybeLoadInfo)
return false;
std::tie(LowestIdxLoad, LowestIdx) = *MaybeLoadInfo;
// We have a bunch of loads being OR'd together. Using the addresses + offsets
// we found before, check if this corresponds to a big or little endian byte
// pattern. If it does, then we can represent it using a load + possibly a
// BSWAP.
bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
if (!IsBigEndian.hasValue())
return false;
bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
return false;
// Make sure that the load from the lowest index produces offset 0 in the
// final value.
//
// This ensures that we won't combine something like this:
//
// load x[i] -> byte 2
// load x[i+1] -> byte 0 ---> wide_load x[i]
// load x[i+2] -> byte 1
const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
const unsigned ZeroByteOffset =
*IsBigEndian
? bigEndianByteAt(NumLoadsInTy, 0)
: littleEndianByteAt(NumLoadsInTy, 0);
auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
if (ZeroOffsetIdx == MemOffset2Idx.end() ||
ZeroOffsetIdx->second != LowestIdx)
return false;
// We will reuse the pointer from the load which ends up at byte offset 0. It
// may not use index 0.
Register Ptr = LowestIdxLoad->getOperand(1).getReg();
const MachineMemOperand &MMO = **LowestIdxLoad->memoperands_begin();
LegalityQuery::MemDesc MMDesc;
MMDesc.SizeInBits = WideMemSizeInBits;
MMDesc.AlignInBits = MMO.getAlign().value() * 8;
MMDesc.Ordering = MMO.getOrdering();
if (!isLegalOrBeforeLegalizer(
{TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
return false;
auto PtrInfo = MMO.getPointerInfo();
auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);
// Load must be allowed and fast on the target.
LLVMContext &C = MF.getFunction().getContext();
auto &DL = MF.getDataLayout();
bool Fast = false;
if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
!Fast)
return false;
MatchInfo = [=](MachineIRBuilder &MIB) {
Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
MIB.buildLoad(LoadDst, Ptr, *NewMMO);
if (NeedsBSwap)
MIB.buildBSwap(Dst, LoadDst);
};
return true;
}
bool CombinerHelper::applyLoadOrCombine(
MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
Builder.setInstrAndDebugLoc(MI);
MatchInfo(Builder);
MI.eraseFromParent();
return true;
}
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;


@@ -1756,6 +1756,14 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
                            MMO.getFlags(), Fast);
}

bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
                                            const DataLayout &DL, LLT Ty,
                                            const MachineMemOperand &MMO,
                                            bool *Fast) const {
  return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(),
                            MMO.getAlign(), MMO.getFlags(), Fast);
}

BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
  return BranchProbability(MinPercentageForPredictableBranch, 100);
}


@@ -0,0 +1,79 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOT_STRICT
# RUN: llc -debugify-and-strip-all-safe -mattr=+strict-align -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=STRICT
# REQUIRES: asserts
# Check that the load-or combine respects alignment requirements.
...
---
name: misaligned
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $x0, $x1

    ; NOT_STRICT-LABEL: name: misaligned
    ; NOT_STRICT: liveins: $x0, $x1
    ; NOT_STRICT: %ptr:_(p0) = COPY $x1
    ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
    ; NOT_STRICT: $w1 = COPY %full_load(s32)
    ; NOT_STRICT: RET_ReallyLR implicit $w1
    ; STRICT-LABEL: name: misaligned
    ; STRICT: liveins: $x0, $x1
    ; STRICT: %cst_1:_(s64) = G_CONSTANT i64 1
    ; STRICT: %cst_16:_(s32) = G_CONSTANT i32 16
    ; STRICT: %ptr:_(p0) = COPY $x1
    ; STRICT: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
    ; STRICT: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
    ; STRICT: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
    ; STRICT: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
    ; STRICT: %full_load:_(s32) = G_OR %low_half, %high_half
    ; STRICT: $w1 = COPY %full_load(s32)
    ; STRICT: RET_ReallyLR implicit $w1
    %cst_1:_(s64) = G_CONSTANT i64 1
    %cst_16:_(s32) = G_CONSTANT i32 16
    %ptr:_(p0) = COPY $x1
    %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
    %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 2)
    %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 2)
    %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
    %full_load:_(s32) = G_OR %low_half, %high_half
    $w1 = COPY %full_load(s32)
    RET_ReallyLR implicit $w1
...
---
name: aligned
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $x0, $x1

    ; NOT_STRICT-LABEL: name: aligned
    ; NOT_STRICT: liveins: $x0, $x1
    ; NOT_STRICT: %ptr:_(p0) = COPY $x1
    ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
    ; NOT_STRICT: $w1 = COPY %full_load(s32)
    ; NOT_STRICT: RET_ReallyLR implicit $w1
    ; STRICT-LABEL: name: aligned
    ; STRICT: liveins: $x0, $x1
    ; STRICT: %ptr:_(p0) = COPY $x1
    ; STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
    ; STRICT: $w1 = COPY %full_load(s32)
    ; STRICT: RET_ReallyLR implicit $w1
    %cst_1:_(s64) = G_CONSTANT i64 1
    %cst_16:_(s32) = G_CONSTANT i32 16
    %ptr:_(p0) = COPY $x1
    %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
    %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 4)
    %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 4)
    %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
    %full_load:_(s32) = G_OR %low_half, %high_half
    $w1 = COPY %full_load(s32)
    RET_ReallyLR implicit $w1

File diff suppressed because it is too large.