[GlobalISel] Combine (a[0]) | (a[1] << k1) | ... | (a[m] << kn) into a wide load
This is a restricted version of the combine in `DAGCombiner::MatchLoadCombine` (see D27861).

This tries to recognize patterns like below (assuming a little-endian target):

```
s8 *a = ...
s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
->
s32 val = *((s32)a)

s8 *a = ...
s32 val = a[3] | (a[2] << 8) | (a[1] << 16) | (a[0] << 24)
->
s32 val = BSWAP(*((s32)a))
```

(This patch also handles the big-endian target case, in which the first example above gets a BSWAP and the second one does not.)

To recognize the pattern, this searches from the last G_OR in the expression tree, e.g.

```
Reg   Reg
 \    /
  OR_1   Reg
   \    /
    OR_2
      \     Reg
       .. /
      Root
```

Each non-OR register in the tree is put in a list. Each register in the list is then checked to see if it is defined by an appropriate load + shift. If every register is a load + possibly a shift, the combine checks whether those loads + shifts, when OR'd together, are equivalent to a wide load (possibly with a BSWAP).

To simplify things, this patch

(1) Only handles G_ZEXTLOADs (which appear to be the common case)
(2) Only works in a single MachineBasicBlock
(3) Only handles G_SHL as the bit twiddling that places a small load at a specific position

An IR example of this is here: https://godbolt.org/z/4sP9Pj (lifted from test/CodeGen/AArch64/load-combine.ll)

At -Os on AArch64, this is a 0.5% code size improvement for CTMark/sqlite3, and a 0.4% improvement for CTMark/7zip-benchmark.

Also fix a bug in `isPredecessor` which caused it to fail whenever `DefMI` was the first instruction in the block.

Differential Revision: https://reviews.llvm.org/D94350
parent 9c6a00fe99
commit cfc6073017
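For reference, a C++ sketch of the source-level pattern this combine targets (my own illustration; the godbolt link above and test/CodeGen/AArch64/load-combine.ll are the authoritative examples):

```cpp
#include <cstdint>

// Little-endian byte assembly: with this combine, GlobalISel can select a
// single 32-bit load instead of four zero-extending byte loads + shifts + ORs.
uint32_t load_le(const uint8_t *a) {
  return (uint32_t)a[0] | ((uint32_t)a[1] << 8) | ((uint32_t)a[2] << 16) |
         ((uint32_t)a[3] << 24);
}

// Reversed byte order: one 32-bit load plus a byte swap (e.g. REV on AArch64),
// assuming a little-endian target.
uint32_t load_be(const uint8_t *a) {
  return ((uint32_t)a[0] << 24) | ((uint32_t)a[1] << 16) |
         ((uint32_t)a[2] << 8) | (uint32_t)a[3];
}
```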
@@ -18,6 +18,7 @@
#define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Alignment.h"

@@ -471,6 +472,20 @@ public:
  bool applyCombineInsertVecElts(MachineInstr &MI,
                                 SmallVectorImpl<Register> &MatchInfo);

  /// Match expression trees of the form
  ///
  /// \code
  ///  sN *a = ...
  ///  sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ...
  /// \endcode
  ///
  /// And check if the tree can be replaced with an M-bit load + possibly a
  /// bswap.
  bool matchLoadOrCombine(MachineInstr &MI,
                          std::function<void(MachineIRBuilder &)> &MatchInfo);
  bool applyLoadOrCombine(MachineInstr &MI,
                          std::function<void(MachineIRBuilder &)> &MatchInfo);

  /// Try to transform \p MI by using all of the above
  /// combine functions. Returns true if changed.
  bool tryCombine(MachineInstr &MI);

@@ -499,6 +514,30 @@ private:
  /// \returns true if a candidate is found.
  bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base,
                             Register &Offset);

  /// Helper function for matchLoadOrCombine. Searches for Registers
  /// which may have been produced by a load instruction + some arithmetic.
  ///
  /// \param [in] Root - The search root.
  ///
  /// \returns The Registers found during the search.
  Optional<SmallVector<Register, 8>>
  findCandidatesForLoadOrCombine(const MachineInstr *Root) const;

  /// Helper function for matchLoadOrCombine.
  ///
  /// Checks if every register in \p RegsToVisit is defined by a load
  /// instruction + some arithmetic.
  ///
  /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up
  /// at to the index of the load.
  /// \param [in] MemSizeInBits - The number of bits each load should produce.
  ///
  /// \returns The lowest-index load found and the lowest index on success.
  Optional<std::pair<MachineInstr *, int64_t>> findLoadOffsetsForLoadOrCombine(
      SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
      const SmallVector<Register, 8> &RegsToVisit,
      const unsigned MemSizeInBits);
};

} // namespace llvm
@@ -1658,6 +1658,11 @@ public:
                          const MachineMemOperand &MMO,
                          bool *Fast = nullptr) const;

  /// LLT handling variant.
  bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty,
                          const MachineMemOperand &MMO,
                          bool *Fast = nullptr) const;

  /// Returns the target specific optimal type for load and store operations as
  /// a result of memset, memcpy, and memmove lowering.
  /// It returns EVT::Other if the type should be determined using generic
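A hedged usage sketch of the new LLT overload (this mirrors how matchLoadOrCombine uses it further down; the wrapper function here is my own, not part of the patch):

```cpp
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/TargetLowering.h"

using namespace llvm;

// Ask the target whether a wide access described by MMO is both allowed and
// fast before forming the combined load.
static bool wideAccessIsAllowedAndFast(const TargetLoweringBase &TLI,
                                       LLVMContext &Ctx, const DataLayout &DL,
                                       LLT WideTy,
                                       const MachineMemOperand &MMO) {
  bool Fast = false;
  return TLI.allowsMemoryAccess(Ctx, DL, WideTy, MMO, &Fast) && Fast;
}
```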
@@ -545,6 +545,14 @@ def combine_insert_vec_elts_build_vector : GICombineRule<
    [{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
  (apply [{ return Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;

def load_or_combine_matchdata :
  GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
def load_or_combine : GICombineRule<
  (defs root:$root, load_or_combine_matchdata:$info),
  (match (wip_match_opcode G_OR):$root,
    [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
  (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>;

// Currently only the one combine above.
def insert_vec_elt_combines : GICombineGroup<
  [combine_insert_vec_elts_build_vector]>;

@@ -587,4 +595,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
    unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
    unmerge_zext_to_zext, trunc_ext_fold, trunc_shl,
    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
    shift_immed_chain, shift_of_shifted_logic_chain]>;
    shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>;
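A sketch (not the TableGen-generated code) of how a combiner would drive the two entry points wired up by the `load_or_combine` rule; the free function name is hypothetical:

```cpp
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include <functional>

using namespace llvm;

// The matchdata is a std::function: match fills it in, apply replays it into
// the MachineIRBuilder, emitting the wide G_LOAD (plus G_BSWAP if needed) and
// erasing the root G_OR.
static bool tryLoadOrCombine(CombinerHelper &Helper, MachineInstr &RootOr) {
  std::function<void(MachineIRBuilder &)> MatchInfo;
  if (!Helper.matchLoadOrCombine(RootOr, MatchInfo))
    return false;
  return Helper.applyLoadOrCombine(RootOr, MatchInfo);
}
```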
@@ -48,6 +48,66 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
  return *Builder.getMF().getSubtarget().getTargetLowering();
}

/// \returns The little endian in-memory byte position of byte \p I in a
/// \p ByteWidth bytes wide type.
///
/// E.g. Given a 4-byte type x, x[0] -> byte 0
static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
  assert(I < ByteWidth && "I must be in [0, ByteWidth)");
  return I;
}

/// \returns The big endian in-memory byte position of byte \p I in a
/// \p ByteWidth bytes wide type.
///
/// E.g. Given a 4-byte type x, x[0] -> byte 3
static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
  assert(I < ByteWidth && "I must be in [0, ByteWidth)");
  return ByteWidth - I - 1;
}

/// Given a map from byte offsets in memory to indices in a load/store,
/// determine if that map corresponds to a little or big endian byte pattern.
///
/// \param MemOffset2Idx maps memory offsets to address offsets.
/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
///
/// \returns true if the map corresponds to a big endian byte pattern, false
/// if it corresponds to a little endian byte pattern, and None otherwise.
///
/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
/// are as follows:
///
/// AddrOffset   Little endian   Big endian
/// 0            0               3
/// 1            1               2
/// 2            2               1
/// 3            3               0
static Optional<bool>
isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
            int64_t LowestIdx) {
  // Need at least two byte positions to decide on endianness.
  unsigned Width = MemOffset2Idx.size();
  if (Width < 2)
    return None;
  bool BigEndian = true, LittleEndian = true;
  for (unsigned MemOffset = 0; MemOffset < Width; ++MemOffset) {
    auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
    if (MemOffsetAndIdx == MemOffset2Idx.end())
      return None;
    const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
    assert(Idx >= 0 && "Expected non-negative byte offset?");
    LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
    BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
    if (!BigEndian && !LittleEndian)
      return None;
  }

  assert((BigEndian != LittleEndian) &&
         "Pattern cannot be both big and little endian!");
  return BigEndian;
}
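A standalone sketch (not part of the patch) of the same decision, with concrete examples of the two byte patterns from the table above:

```cpp
#include <cstdint>
#include <map>
#include <optional>

// Mirror of the static helpers above: for a Width-byte access, load index I
// lands at memory offset I (little endian) or Width - I - 1 (big endian).
static std::optional<bool>
sketchIsBigEndian(const std::map<int64_t, int64_t> &MemOffset2Idx,
                  int64_t LowestIdx) {
  const int64_t Width = MemOffset2Idx.size();
  if (Width < 2)
    return std::nullopt;
  bool Big = true, Little = true;
  for (int64_t Off = 0; Off < Width; ++Off) {
    auto It = MemOffset2Idx.find(Off);
    if (It == MemOffset2Idx.end())
      return std::nullopt;
    const int64_t Idx = It->second - LowestIdx;
    Little &= Idx == Off;           // littleEndianByteAt(Width, Off)
    Big &= Idx == Width - Off - 1;  // bigEndianByteAt(Width, Off)
    if (!Big && !Little)
      return std::nullopt;
  }
  return Big;
}

// E.g. {0->0, 1->1, 2->2, 3->3} with LowestIdx = 0 is little endian (false);
// {0->3, 1->2, 2->1, 3->0} is big endian (true); {0->1, 1->0, 2->2, 3->3}
// is neither (nullopt), so no combine happens.
```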
bool CombinerHelper::isLegalOrBeforeLegalizer(
    const LegalityQuery &Query) const {
  return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -564,13 +624,16 @@ bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
  assert(DefMI.getParent() == UseMI.getParent());
  if (&DefMI == &UseMI)
    return false;

  // Loop through the basic block until we find one of the instructions.
  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  for (; &*I != &DefMI && &*I != &UseMI; ++I)
    return &*I == &DefMI;

  llvm_unreachable("Block must contain instructions");
  const MachineBasicBlock &MBB = *DefMI.getParent();
  auto NonDbgInsts =
      instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
  auto DefOrUse =
      find_if(NonDbgInsts, [&DefMI, &UseMI](const MachineInstr &MI) {
        return &MI == &DefMI || &MI == &UseMI;
      });
  if (DefOrUse == NonDbgInsts.end())
    llvm_unreachable("Block must contain both DefMI and UseMI!");
  return &*DefOrUse == &DefMI;
}

bool CombinerHelper::dominates(const MachineInstr &DefMI,
@@ -3152,6 +3215,361 @@ bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
  return true;
}

Optional<SmallVector<Register, 8>>
CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
  assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
  // We want to detect if Root is part of a tree which represents a bunch
  // of loads being merged into a larger load. We'll try to recognize patterns
  // like, for example:
  //
  //  Reg   Reg
  //   \    /
  //    OR_1   Reg
  //     \    /
  //      OR_2
  //        \     Reg
  //         .. /
  //         Root
  //
  //  Reg   Reg   Reg   Reg
  //    \   /      \   /
  //     OR_1      OR_2
  //       \       /
  //        \     /
  //         ...
  //         Root
  //
  // Each "Reg" may have been produced by a load + some arithmetic. This
  // function will save each of them.
  SmallVector<Register, 8> RegsToVisit;
  SmallVector<const MachineInstr *, 7> Ors = {Root};

  // In the "worst" case, we're dealing with a load for each byte. So, there
  // are at most #bytes - 1 ORs.
  const unsigned MaxIter =
      MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
  for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
    if (Ors.empty())
      break;
    const MachineInstr *Curr = Ors.pop_back_val();
    Register OrLHS = Curr->getOperand(1).getReg();
    Register OrRHS = Curr->getOperand(2).getReg();

    // In the combine, we want to eliminate the entire tree.
    if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
      return None;

    // If it's a G_OR, save it and continue to walk. If it's not, then it's
    // something that may be a load + arithmetic.
    if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
      Ors.push_back(Or);
    else
      RegsToVisit.push_back(OrLHS);
    if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
      Ors.push_back(Or);
    else
      RegsToVisit.push_back(OrRHS);
  }

  // We're going to try and merge each register into a wider power-of-2 type,
  // so we ought to have an even number of registers.
  if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
    return None;
  return RegsToVisit;
}
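A toy sketch (my own, over a made-up node type rather than MachineInstrs, and without the one-use checks) of the worklist walk above, just to show how the leaves accumulate:

```cpp
#include <vector>

struct Node {
  bool IsOr = false;
  Node *LHS = nullptr, *RHS = nullptr; // only used when IsOr is true
};

// Flatten an OR tree rooted at Root into its non-OR leaves, visiting at most
// MaxOrs OR nodes (for an N-byte result there are at most N - 1 of them).
static std::vector<Node *> collectLeaves(Node *Root, unsigned MaxOrs) {
  std::vector<Node *> Leaves;
  std::vector<Node *> Ors{Root};
  for (unsigned Iter = 0; Iter < MaxOrs && !Ors.empty(); ++Iter) {
    Node *Curr = Ors.back();
    Ors.pop_back();
    for (Node *Op : {Curr->LHS, Curr->RHS})
      (Op->IsOr ? Ors : Leaves).push_back(Op);
  }
  // E.g. ((a | b) | (c | d)) with MaxOrs = 3 yields the four leaves a..d.
  return Leaves;
}
```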
/// Helper function for findLoadOffsetsForLoadOrCombine.
///
/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
/// and then moving that value into a specific byte offset.
///
/// e.g. x[i] << 24
///
/// \returns The load instruction and the byte offset it is moved into.
static Optional<std::pair<MachineInstr *, int64_t>>
matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
                         const MachineRegisterInfo &MRI) {
  assert(MRI.hasOneNonDBGUse(Reg) &&
         "Expected Reg to only have one non-debug use?");
  Register MaybeLoad;
  int64_t Shift;
  if (!mi_match(Reg, MRI,
                m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
    Shift = 0;
    MaybeLoad = Reg;
  }

  if (Shift % MemSizeInBits != 0)
    return None;

  // TODO: Handle other types of loads.
  auto *Load = getOpcodeDef(TargetOpcode::G_ZEXTLOAD, MaybeLoad, MRI);
  if (!Load)
    return None;

  const auto &MMO = **Load->memoperands_begin();
  if (!MMO.isUnordered() || MMO.getSizeInBits() != MemSizeInBits)
    return None;

  return std::make_pair(Load, Shift / MemSizeInBits);
}
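A worked example (illustration only) of the shift-to-byte-position arithmetic used above, for 8-bit narrow loads:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// The shift amount must be a multiple of the narrow load's size in bits; the
// quotient is the byte position the loaded value occupies in the final value.
static std::optional<int64_t> bytePosition(int64_t ShiftInBits,
                                           unsigned MemSizeInBits) {
  if (ShiftInBits % MemSizeInBits != 0)
    return std::nullopt;
  return ShiftInBits / MemSizeInBits;
}

int main() {
  assert(bytePosition(0, 8) == 0);  // a[i] with no shift -> byte 0
  assert(bytePosition(16, 8) == 2); // a[i] << 16         -> byte 2
  assert(!bytePosition(12, 8));     // a[i] << 12 is rejected
}
```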
Optional<std::pair<MachineInstr *, int64_t>>
CombinerHelper::findLoadOffsetsForLoadOrCombine(
    SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
    const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {

  // Each load found for the pattern. There should be one for each RegsToVisit.
  SmallSetVector<const MachineInstr *, 8> Loads;

  // The lowest index used in any load. (The lowest "i" for each x[i].)
  int64_t LowestIdx = INT64_MAX;

  // The load which uses the lowest index.
  MachineInstr *LowestIdxLoad = nullptr;

  // Keeps track of the load indices we see. We shouldn't see any indices twice.
  SmallSet<int64_t, 8> SeenIdx;

  // Ensure each load is in the same MBB.
  // TODO: Support multiple MachineBasicBlocks.
  MachineBasicBlock *MBB = nullptr;
  const MachineMemOperand *MMO = nullptr;

  // Earliest instruction-order load in the pattern.
  MachineInstr *EarliestLoad = nullptr;

  // Latest instruction-order load in the pattern.
  MachineInstr *LatestLoad = nullptr;

  // Base pointer which every load should share.
  Register BasePtr;

  // We want to find a load for each register. Each load should have some
  // appropriate bit twiddling arithmetic. During this loop, we will also keep
  // track of the load which uses the lowest index. Later, we will check if we
  // can use its pointer in the final, combined load.
  for (auto Reg : RegsToVisit) {
    // Find the load, and find the position that it will end up at in the
    // (possibly shifted) value.
    auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
    if (!LoadAndPos)
      return None;
    MachineInstr *Load;
    int64_t DstPos;
    std::tie(Load, DstPos) = *LoadAndPos;

    // TODO: Handle multiple MachineBasicBlocks. Currently not handled because
    // it is difficult to check for stores/calls/etc between loads.
    MachineBasicBlock *LoadMBB = Load->getParent();
    if (!MBB)
      MBB = LoadMBB;
    if (LoadMBB != MBB)
      return None;

    // Make sure that the MachineMemOperands of every seen load are compatible.
    const MachineMemOperand *LoadMMO = *Load->memoperands_begin();
    if (!MMO)
      MMO = LoadMMO;
    if (MMO->getAddrSpace() != LoadMMO->getAddrSpace())
      return None;

    // Find out what the base pointer and index for the load is.
    Register LoadPtr;
    int64_t Idx;
    if (!mi_match(Load->getOperand(1).getReg(), MRI,
                  m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
      LoadPtr = Load->getOperand(1).getReg();
      Idx = 0;
    }

    // Don't combine things like a[i], a[i] -> a bigger load.
    if (!SeenIdx.insert(Idx).second)
      return None;

    // Every load must share the same base pointer; don't combine things like:
    //
    // a[i], b[i + 1] -> a bigger load.
    if (!BasePtr.isValid())
      BasePtr = LoadPtr;
    if (BasePtr != LoadPtr)
      return None;

    if (Idx < LowestIdx) {
      LowestIdx = Idx;
      LowestIdxLoad = Load;
    }

    // Keep track of the byte offset that this load ends up at. If we have seen
    // the byte offset, then stop here. We do not want to combine:
    //
    // a[i] << 16, a[i + k] << 16 -> a bigger load.
    if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
      return None;
    Loads.insert(Load);

    // Keep track of the position of the earliest/latest loads in the pattern.
    // We will check that there are no load fold barriers between them later
    // on.
    //
    // FIXME: Is there a better way to check for load fold barriers?
    if (!EarliestLoad || dominates(*Load, *EarliestLoad))
      EarliestLoad = Load;
    if (!LatestLoad || dominates(*LatestLoad, *Load))
      LatestLoad = Load;
  }

  // We found a load for each register. Let's check if each load satisfies the
  // pattern.
  assert(Loads.size() == RegsToVisit.size() &&
         "Expected to find a load for each register?");
  assert(EarliestLoad != LatestLoad && EarliestLoad &&
         LatestLoad && "Expected at least two loads?");

  // Check if there are any stores, calls, etc. between any of the loads. If
  // there are, then we can't safely perform the combine.
  //
  // MaxIter is chosen based off the (worst case) number of iterations it
  // typically takes to succeed in the LLVM test suite plus some padding.
  //
  // FIXME: Is there a better way to check for load fold barriers?
  const unsigned MaxIter = 20;
  unsigned Iter = 0;
  for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
                                                 LatestLoad->getIterator())) {
    if (Loads.count(&MI))
      continue;
    if (MI.isLoadFoldBarrier())
      return None;
    if (Iter++ == MaxIter)
      return None;
  }

  return std::make_pair(LowestIdxLoad, LowestIdx);
}
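As an illustration (my own, not from the patch) of what the out-parameter and return value hold on success: for `a[4] | (a[5] << 8) | (a[6] << 16) | (a[7] << 24)` with 8-bit narrow loads, the map and lowest index would be:

```cpp
#include <cstdint>
#include <map>

int main() {
  // Byte position in the final value -> pointer index of the narrow load.
  std::map<int64_t, int64_t> MemOffset2Idx = {
      {0, 4}, // a[4]       -> byte 0
      {1, 5}, // a[5] << 8  -> byte 1
      {2, 6}, // a[6] << 16 -> byte 2
      {3, 7}, // a[7] << 24 -> byte 3
  };
  int64_t LowestIdx = 4; // and the returned load is the G_ZEXTLOAD of a[4]
  (void)MemOffset2Idx;
  (void)LowestIdx;
}
```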
bool CombinerHelper::matchLoadOrCombine(
    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_OR);
  MachineFunction &MF = *MI.getMF();
  // Assuming a little-endian target, transform:
  //  s8 *a = ...
  //  s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
  // =>
  //  s32 val = *((i32)a)
  //
  //  s8 *a = ...
  //  s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
  // =>
  //  s32 val = BSWAP(*((s32)a))
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  if (Ty.isVector())
    return false;

  // We need to combine at least two loads into this type. Since the smallest
  // possible load is into a byte, we need at least a 16-bit wide type.
  const unsigned WideMemSizeInBits = Ty.getSizeInBits();
  if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
    return false;

  // Match a collection of non-OR instructions in the pattern.
  auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
  if (!RegsToVisit)
    return false;

  // We have a collection of non-OR instructions. Figure out how wide each of
  // the small loads should be based off of the number of potential loads we
  // found.
  const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
  if (NarrowMemSizeInBits % 8 != 0)
    return false;

  // Check if each register feeding into each OR is a load from the same
  // base pointer + some arithmetic.
  //
  // e.g. a[0], a[1] << 8, a[2] << 16, etc.
  //
  // Also verify that each of these ends up putting a[i] into the same memory
  // offset as a load into a wide type would.
  SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
  MachineInstr *LowestIdxLoad;
  int64_t LowestIdx;
  auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
      MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
  if (!MaybeLoadInfo)
    return false;
  std::tie(LowestIdxLoad, LowestIdx) = *MaybeLoadInfo;

  // We have a bunch of loads being OR'd together. Using the addresses + offsets
  // we found before, check if this corresponds to a big or little endian byte
  // pattern. If it does, then we can represent it using a load + possibly a
  // BSWAP.
  bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
  Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
  if (!IsBigEndian.hasValue())
    return false;
  bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
  if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
    return false;

  // Make sure that the load from the lowest index produces offset 0 in the
  // final value.
  //
  // This ensures that we won't combine something like this:
  //
  // load x[i] -> byte 2
  // load x[i+1] -> byte 0 ---> wide_load x[i]
  // load x[i+2] -> byte 1
  const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
  const unsigned ZeroByteOffset =
      *IsBigEndian
          ? bigEndianByteAt(NumLoadsInTy, 0)
          : littleEndianByteAt(NumLoadsInTy, 0);
  auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
  if (ZeroOffsetIdx == MemOffset2Idx.end() ||
      ZeroOffsetIdx->second != LowestIdx)
    return false;

  // We will reuse the pointer from the load which ends up at byte offset 0. It
  // may not use index 0.
  Register Ptr = LowestIdxLoad->getOperand(1).getReg();
  const MachineMemOperand &MMO = **LowestIdxLoad->memoperands_begin();
  LegalityQuery::MemDesc MMDesc;
  MMDesc.SizeInBits = WideMemSizeInBits;
  MMDesc.AlignInBits = MMO.getAlign().value() * 8;
  MMDesc.Ordering = MMO.getOrdering();
  if (!isLegalOrBeforeLegalizer(
          {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
    return false;
  auto PtrInfo = MMO.getPointerInfo();
  auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);

  // Load must be allowed and fast on the target.
  LLVMContext &C = MF.getFunction().getContext();
  auto &DL = MF.getDataLayout();
  bool Fast = false;
  if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
      !Fast)
    return false;

  MatchInfo = [=](MachineIRBuilder &MIB) {
    Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
    MIB.buildLoad(LoadDst, Ptr, *NewMMO);
    if (NeedsBSwap)
      MIB.buildBSwap(Dst, LoadDst);
  };
  return true;
}

bool CombinerHelper::applyLoadOrCombine(
    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI);
  MatchInfo(Builder);
  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::tryCombine(MachineInstr &MI) {
  if (tryCombineCopy(MI))
    return true;
@@ -1756,6 +1756,14 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
                            MMO.getFlags(), Fast);
}

bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
                                            const DataLayout &DL, LLT Ty,
                                            const MachineMemOperand &MMO,
                                            bool *Fast) const {
  return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(),
                            MMO.getAlign(), MMO.getFlags(), Fast);
}

BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
  return BranchProbability(MinPercentageForPredictableBranch, 100);
}
@@ -0,0 +1,79 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOT_STRICT
# RUN: llc -debugify-and-strip-all-safe -mattr=+strict-align -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=STRICT

# REQUIRES: asserts

# Check that the load-or combine respects alignment requirements.
...
---
name: misaligned
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $x0, $x1
    ; NOT_STRICT-LABEL: name: misaligned
    ; NOT_STRICT: liveins: $x0, $x1
    ; NOT_STRICT: %ptr:_(p0) = COPY $x1
    ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
    ; NOT_STRICT: $w1 = COPY %full_load(s32)
    ; NOT_STRICT: RET_ReallyLR implicit $w1
    ; STRICT-LABEL: name: misaligned
    ; STRICT: liveins: $x0, $x1
    ; STRICT: %cst_1:_(s64) = G_CONSTANT i64 1
    ; STRICT: %cst_16:_(s32) = G_CONSTANT i32 16
    ; STRICT: %ptr:_(p0) = COPY $x1
    ; STRICT: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
    ; STRICT: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
    ; STRICT: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
    ; STRICT: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
    ; STRICT: %full_load:_(s32) = G_OR %low_half, %high_half
    ; STRICT: $w1 = COPY %full_load(s32)
    ; STRICT: RET_ReallyLR implicit $w1
    %cst_1:_(s64) = G_CONSTANT i64 1
    %cst_16:_(s32) = G_CONSTANT i32 16

    %ptr:_(p0) = COPY $x1
    %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)

    %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 2)
    %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 2)
    %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)

    %full_load:_(s32) = G_OR %low_half, %high_half
    $w1 = COPY %full_load(s32)
    RET_ReallyLR implicit $w1

...
---
name: aligned
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $x0, $x1

    ; NOT_STRICT-LABEL: name: aligned
    ; NOT_STRICT: liveins: $x0, $x1
    ; NOT_STRICT: %ptr:_(p0) = COPY $x1
    ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
    ; NOT_STRICT: $w1 = COPY %full_load(s32)
    ; NOT_STRICT: RET_ReallyLR implicit $w1
    ; STRICT-LABEL: name: aligned
    ; STRICT: liveins: $x0, $x1
    ; STRICT: %ptr:_(p0) = COPY $x1
    ; STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
    ; STRICT: $w1 = COPY %full_load(s32)
    ; STRICT: RET_ReallyLR implicit $w1
    %cst_1:_(s64) = G_CONSTANT i64 1
    %cst_16:_(s32) = G_CONSTANT i32 16

    %ptr:_(p0) = COPY $x1
    %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)

    %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 4)
    %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 4)
    %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)

    %full_load:_(s32) = G_OR %low_half, %high_half
    $w1 = COPY %full_load(s32)
    RET_ReallyLR implicit $w1
...