bpf: new option -bpf-expand-memcpy-in-order to expand memcpy in order

Some BPF JIT backends want to optimize memcpy in their own architecture-specific way. However, at the moment, there is no way for JIT backends to see memcpy semantics reliably. This is because the LLVM BPF backend expands memcpy into load/store sequences, which may then be scheduled further apart from each other. So, BPF JIT backends inside the kernel can't reliably recognize memcpy semantics by peephole-matching the BPF sequence. This patch introduces a new intrinsic-expansion infrastructure for memcpy. To get a stable, in-order load/store sequence from memcpy, we first lower memcpy into a BPF::MEMCPY node, which is then expanded into in-order load/store sequences in the expandPostRAPseudo pass, which happens after instruction scheduling. This way, kernel JIT backends can reliably recognize memcpy by scanning the BPF sequence. This new memcpy expansion infrastructure is gated by a new option: -bpf-expand-memcpy-in-order Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com> Signed-off-by: Jiong Wang <jiong.wang@netronome.com> Signed-off-by: Yonghong Song <yhs@fb.com> llvm-svn: 337977
2018-07-25 22:40:02 +00:00 · 2018-07-25 22:40:02 +00:00 · 71d81e5c8f
parent 99ca3c0a61
commit 71d81e5c8f
10 changed files with 371 additions and 8 deletions
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@ -33,6 +33,10 @@ using namespace llvm;

 #define DEBUG_TYPE "bpf-lower"

+// Hidden command-line switch, off by default. When set, the lowering
+// constructor zeroes the MaxStoresPerMem* limits so that memcpy is handed to
+// the target expander instead of the generic inline expansion.
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+  cl::Hidden, cl::init(false),
+  cl::desc("Expand memcpy into load/store pairs in order"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
@ -132,10 +136,30 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
  setMinFunctionAlignment(3);
  setPrefFunctionAlignment(3);

-  // inline memcpy() for kernel to see explicit copy
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+  if (BPFExpandMemcpyInOrder) {
+    // LLVM generic code will try to expand memcpy into load/store pairs at this
+    // stage which is before quite a few IR optimization passes, therefore the
+    // loads and stores could potentially be moved apart from each other which
+    // will cause trouble to memcpy pattern matcher inside kernel eBPF JIT
+    // compilers.
+    //
+    // When -bpf-expand-memcpy-in-order specified, we want to defer the expand
+    // of memcpy to later stage in IR optimization pipeline so those load/store
+    // pairs won't be touched and could be kept in order. Hence, we set
+    // MaxStoresPerMem* to zero to disable the generic getMemcpyLoadsAndStores
+    // code path, and ask LLVM to use target expander EmitTargetCodeForMemcpy.
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+  } else {
+    // inline memcpy() for kernel to see explicit copy
+    unsigned CommonMaxStores =
+      STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+  }

  // CPU/Feature control
  HasAlu32 = STI.getHasAlu32();
@ -518,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
    return "BPFISD::BR_CC";
  case BPFISD::Wrapper:
    return "BPFISD::Wrapper";
+  case BPFISD::MEMCPY:
+    return "BPFISD::MEMCPY";
  }
  return nullptr;
 }
@ -556,6 +582,37 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
  return PromotedReg2;
 }

+/// Custom inserter for the MEMCPY pseudo: appends a scratch register operand
+/// to \p MI and returns \p BB unchanged.
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                     MachineBasicBlock *BB)
+                                                     const {
+  // BPFISD::MEMCPY only provides two register operands, the copy source
+  // address and the copy destination address. Its later expansion into
+  // load/store pairs needs a third register to hold each chunk between the
+  // load and the store, so one is attached here.
+  MachineFunction *ParentFn = MI.getParent()->getParent();
+  MachineRegisterInfo &RegInfo = ParentFn->getRegInfo();
+  MachineInstrBuilder Builder(*ParentFn, MI);
+
+  // Flags carried by the scratch operand:
+  //  - EarlyClobber: the operand is clobbered before the remaining inputs are
+  //    read, so it must be distinct from every other operand.
+  //  - Define: keeps the machine verifier from objecting to an undefined
+  //    value, since memory is loaded into the register anyway.
+  //  - Dead: no other instruction is supposed to read the scratch value.
+  const unsigned ScratchFlags =
+      RegState::Define | RegState::Dead | RegState::EarlyClobber;
+  const unsigned Scratch = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+  Builder.addReg(Scratch, ScratchFlags);
+
+  return BB;
+}
+
 MachineBasicBlock *
 BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@ -567,6 +624,8 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                       Opc == BPF::Select_32 ||
                       Opc == BPF::Select_32_64);

+  bool isMemcpyOp = Opc == BPF::MEMCPY;
+
 #ifndef NDEBUG
  bool isSelectRIOp = (Opc == BPF::Select_Ri ||
                       Opc == BPF::Select_Ri_64_32 ||
@ -574,9 +633,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                       Opc == BPF::Select_Ri_32_64);


-  assert((isSelectRROp || isSelectRIOp) && "Unexpected instr type to insert");
+  assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+         "Unexpected instr type to insert");
 #endif

+  if (isMemcpyOp)
+    return EmitInstrWithCustomInserterMemcpy(MI, BB);
+
  bool is32BitCmp = (Opc == BPF::Select_32 ||
                     Opc == BPF::Select_32_64 ||
                     Opc == BPF::Select_Ri_32 ||
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@ -28,7 +28,8 @@ enum NodeType : unsigned {
  CALL,
  SELECT_CC,
  BR_CC,
-  Wrapper
+  Wrapper,
+  MEMCPY
 };
 }

@ -110,6 +111,11 @@ private:

  unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                         bool isSigned) const;
+
+  MachineBasicBlock * EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                        MachineBasicBlock *BB)
+                                                        const;
+
 };
 }

--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@ -43,6 +43,83 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
    llvm_unreachable("Impossible reg-to-reg copy");
 }

+/// Expand a MEMCPY pseudo into load/store pairs emitted in ascending offset
+/// order, then erase the pseudo.
+///
+/// Operands read from \p MI: 0 = destination address register, 1 = source
+/// address register, 2 = copy length immediate, 3 = alignment immediate,
+/// 4 = scratch register that carries each chunk from the load to the store.
+///
+/// The bulk of the copy uses the widest access the alignment permits (at most
+/// 8 bytes). A tail shorter than one access is finished with at most one
+/// 4-byte, one 2-byte and one 1-byte copy, in that order.
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  uint64_t CopyLen = MI->getOperand(2).getImm();
+  uint64_t Alignment = MI->getOperand(3).getImm();
+  unsigned ScratchReg = MI->getOperand(4).getReg();
+  MachineBasicBlock *BB = MI->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+  unsigned LdOpc, StOpc;
+
+  // A power-of-two alignment above 8 still guarantees 8-byte alignment, so cap
+  // the access width at 8 bytes instead of treating it as unsupported.
+  const uint64_t AccessSize =
+      (Alignment > 8 && isPowerOf2_64(Alignment)) ? 8 : Alignment;
+
+  switch (AccessSize) {
+  case 1:
+    LdOpc = BPF::LDB;
+    StOpc = BPF::STB;
+    break;
+  case 2:
+    LdOpc = BPF::LDH;
+    StOpc = BPF::STH;
+    break;
+  case 4:
+    LdOpc = BPF::LDW;
+    StOpc = BPF::STW;
+    break;
+  case 8:
+    LdOpc = BPF::LDD;
+    StOpc = BPF::STD;
+    break;
+  default:
+    llvm_unreachable("unsupported memcpy alignment");
+  }
+
+  // Emit "scratch = *(src + Off); *(dst + Off) = scratch" ahead of the pseudo.
+  // The load writes the scratch register, so that operand must be flagged as a
+  // definition; the store is its last reader before the next load overwrites
+  // it, so it is flagged as a kill.
+  auto EmitCopy = [&](unsigned Ld, unsigned St, uint64_t Off) {
+    BuildMI(*BB, MI, dl, get(Ld))
+            .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Off);
+    BuildMI(*BB, MI, dl, get(St))
+            .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Off);
+  };
+
+  // Bulk part: as many full-width accesses as fit into the copy length.
+  const uint64_t IterationNum = CopyLen >> Log2_64(AccessSize);
+  for (uint64_t I = 0; I < IterationNum; ++I)
+    EmitCopy(LdOpc, StOpc, I * AccessSize);
+
+  // Tail: fewer than AccessSize bytes remain, so at most bits 4, 2 and 1 of
+  // the remainder can be set. Copy them from widest to narrowest.
+  const uint64_t BytesLeft = CopyLen & (AccessSize - 1);
+  uint64_t Offset = IterationNum * AccessSize;
+  if (BytesLeft & 0x4) {
+    EmitCopy(BPF::LDW, BPF::STW, Offset);
+    Offset += 4;
+  }
+  if (BytesLeft & 0x2) {
+    EmitCopy(BPF::LDH, BPF::STH, Offset);
+    Offset += 2;
+  }
+  if (BytesLeft & 0x1)
+    EmitCopy(BPF::LDB, BPF::STB, Offset);
+
+  BB->erase(MI);
+}
+
+/// Post-RA pseudo expansion hook. Only the MEMCPY pseudo is handled here.
+///
+/// \returns true if \p MI was a MEMCPY pseudo and has been expanded, false if
+/// the instruction was left untouched.
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  // Anything other than MEMCPY is not ours to expand.
+  if (MI.getOpcode() != BPF::MEMCPY)
+    return false;
+
+  expandMEMCPY(MI);
+  return true;
+}
+
 void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       unsigned SrcReg, bool IsKill, int FI,
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@ -34,6 +34,8 @@ public:
                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                   bool KillSrc) const override;

+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
  void storeRegToStackSlot(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
                           bool isKill, int FrameIndex,
@ -55,6 +57,9 @@ public:
                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                        const DebugLoc &DL,
                        int *BytesAdded = nullptr) const override;
+private:
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 };
 }

--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@ -28,6 +28,10 @@ def SDT_BPFBrCC         : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
                                               SDTCisVT<3, OtherVT>]>;
 def SDT_BPFWrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                               SDTCisPtrTy<0>]>;
+// Type profile of the memcpy node: no results and four i64 operands.
+def SDT_BPFMEMCPY       : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+                                               SDTCisVT<1, i64>,
+                                               SDTCisVT<2, i64>,
+                                               SDTCisVT<3, i64>]>;

 def BPFcall         : SDNode<"BPFISD::CALL", SDT_BPFCall,
                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@ -43,6 +47,9 @@ def BPFbrcc         : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,

 def BPFselectcc     : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
 def BPFWrapper      : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+// Selection DAG node for BPFISD::MEMCPY: chained, glued in and out, and
+// marked as possibly loading and possibly storing.
+def BPFmemcpy       : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+                             [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                              SDNPMayStore, SDNPMayLoad]>;
 def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
 def BPFIsBigEndian    : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
 def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
@ -714,3 +721,11 @@ let Predicates = [BPFHasALU32] in {
  def : Pat<(i64 (extloadi32 ADDRri:$src)),
            (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
 }
+
+// Pseudo instruction selected for BPFmemcpy. It takes the destination and
+// source address registers plus the length and alignment immediates.
+// variable_ops leaves room for the extra scratch register operand that the
+// custom inserter appends; the pseudo itself is expanded into load/store
+// pairs later by BPFInstrInfo.
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+    def MEMCPY : Pseudo<
+      (outs),
+      (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+      "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+      [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+/// Lower a memcpy with a constant length to a single BPFISD::MEMCPY node.
+///
+/// \returns the chain result of the new node, or an empty SDValue when the
+/// copy is not handled here: the length is not a constant, the alignment is
+/// not a power of two, or the copy would need more load/store pairs than
+/// getCommonMaxStoresPerMemFunc() allows.
+///
+/// \p isVolatile, \p AlwaysInline, \p DstPtrInfo and \p SrcPtrInfo are not
+/// consulted.
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Requires the copy size to be a constant.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  // Treat a missing alignment as byte alignment and refuse anything that is
+  // not a power of two, so the shift-free arithmetic below stays well defined.
+  unsigned AccessAlign = Align ? Align : 1;
+  if (!isPowerOf2_32(AccessAlign))
+    return SDValue();
+  // The widest access used by the expansion is 8 bytes; a larger alignment
+  // still guarantees 8-byte alignment, so cap it there. This keeps the store
+  // estimate accurate and only hands the pseudo alignments of 1, 2, 4 or 8.
+  if (AccessAlign > 8)
+    AccessAlign = 8;
+
+  // Keep the full 64-bit length: narrowing it would let a huge copy wrap
+  // around to a small one and slip past the limit check.
+  const uint64_t CopyLen = ConstantSize->getZExtValue();
+  const uint64_t StoresNumEstimate =
+      CopyLen / AccessAlign + (CopyLen % AccessAlign != 0 ? 1 : 0);
+  // Impose the same copy length limit as MaxStoresPerMemcpy.
+  if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                    DAG.getConstant(CopyLen, dl, MVT::i64),
+                    DAG.getConstant(AccessAlign, dl, MVT::i64));
+
+  // Result 0 of the node is its chain.
+  return Dst.getValue(0);
+}
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// BPF subclass of SelectionDAGTargetInfo that supplies a target-specific
+/// memcpy lowering hook.
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  /// Store-count limit shared by the BPF memset/memcpy/memmove lowering.
+  unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+  /// Target hook for memcpy lowering; see the definition in
+  /// BPFSelectionDAGInfo.cpp.
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
--- a/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/llvm/lib/Target/BPF/BPFSubtarget.h
@ -17,6 +17,7 @@
 #include "BPFFrameLowering.h"
 #include "BPFISelLowering.h"
 #include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
  BPFInstrInfo InstrInfo;
  BPFFrameLowering FrameLowering;
  BPFTargetLowering TLInfo;
-  SelectionDAGTargetInfo TSInfo;
+  BPFSelectionDAGInfo TSInfo;

 private:
  void initializeEnvironment();
@ -75,7 +76,7 @@ public:
  const BPFTargetLowering *getTargetLowering() const override {
    return &TLInfo;
  }
-  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+  const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
    return &TSInfo;
  }
  const TargetRegisterInfo *getRegisterInfo() const override {
--- a/llvm/lib/Target/BPF/CMakeLists.txt
+++ b/llvm/lib/Target/BPF/CMakeLists.txt
@ -20,6 +20,7 @@ add_llvm_target(BPFCodeGen
  BPFISelLowering.cpp
  BPFMCInstLower.cpp
  BPFRegisterInfo.cpp
+  BPFSelectionDAGInfo.cpp
  BPFSubtarget.cpp
  BPFTargetMachine.cpp
  BPFMIPeephole.cpp
--- a/llvm/test/CodeGen/BPF/memcpy-expand-in-order.ll
+++ b/llvm/test/CodeGen/BPF/memcpy-expand-in-order.ll
@ -0,0 +1,116 @@
+; RUN: llc < %s -march=bpfel -bpf-expand-memcpy-in-order | FileCheck %s
+; RUN: llc < %s -march=bpfeb -bpf-expand-memcpy-in-order | FileCheck %s
+;
+; #define COPY_LEN	9
+;
+; void cal_align1(void *a, void *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; void cal_align2(short *a, short *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN	19
+; void cal_align4(int *a, int *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN	27
+; void cal_align8(long long *a, long long *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+
+; Function Attrs: nounwind
+; 9 bytes at align 1: the checks below expect nine byte-wide load/store pairs
+; at offsets 0 through 8.
+define dso_local void @cal_align1(i8* nocapture %a, i8* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 9, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u8 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u8 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 1)
+; CHECK: *(u8 *)([[DST_REG]] + 1) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 2)
+; CHECK: *(u8 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 3)
+; CHECK: *(u8 *)([[DST_REG]] + 3) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 4)
+; CHECK: *(u8 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 5)
+; CHECK: *(u8 *)([[DST_REG]] + 5) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 6)
+; CHECK: *(u8 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 7)
+; CHECK: *(u8 *)([[DST_REG]] + 7) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+; 9 bytes at align 2: the checks below expect four half-word pairs at offsets
+; 0, 2, 4 and 6, followed by one byte pair at offset 8.
+define dso_local void @cal_align2(i16* nocapture %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i16* %a to i8*
+  %1 = bitcast i16* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 %0, i8* align 2 %1, i64 9, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u16 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u16 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 2)
+; CHECK: *(u16 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 4)
+; CHECK: *(u16 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 6)
+; CHECK: *(u16 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+; 19 bytes at align 4: the checks below expect four word pairs at offsets 0,
+; 4, 8 and 12, then a half-word pair at offset 16 and a byte pair at offset 18.
+define dso_local void @cal_align4(i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 19, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u32 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u32 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 4)
+; CHECK: *(u32 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 8)
+; CHECK: *(u32 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 12)
+; CHECK: *(u32 *)([[DST_REG]] + 12) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 16)
+; CHECK: *(u16 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 18)
+; CHECK: *(u8 *)([[DST_REG]] + 18) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+; 27 bytes at align 8: the checks below expect three double-word pairs at
+; offsets 0, 8 and 16, then a half-word pair at offset 24 and a byte pair at
+; offset 26.
+define dso_local void @cal_align8(i64* nocapture %a, i64* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i64* %a to i8*
+  %1 = bitcast i64* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 27, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u64 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u64 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 8)
+; CHECK: *(u64 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 16)
+; CHECK: *(u64 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 24)
+; CHECK: *(u16 *)([[DST_REG]] + 24) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 26)
+; CHECK: *(u8 *)([[DST_REG]] + 26) = [[SCRATCH_REG]]