[Intrinsics] Make MemCpyInlineInst a MemCpyInst

This opens up more optimization opportunities in passes that already handle MemCpyInst's. Differential revision: https://reviews.llvm.org/D105247
2021-06-30 16:49:48 -07:00 · 2021-06-30 16:49:48 -07:00 · 37b6e03c18
parent 2ff5a56e1a
commit 37b6e03c18
10 changed files with 178 additions and 27 deletions
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@ -647,8 +647,11 @@ public:
                                 TBAAStructTag, ScopeTag, NoAliasTag);
  }

-  CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src,
-                               MaybeAlign SrcAlign, Value *Size);
+  CallInst *
+  CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src,
+                     MaybeAlign SrcAlign, Value *Size, bool IsVolatile = false,
+                     MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr,
+                     MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);

  /// Create and insert an element unordered-atomic memcpy between the
  /// specified pointers.
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@ -209,6 +209,9 @@ public:
  RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
  RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
  RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
+  RetTy visitMemCpyInlineInst(MemCpyInlineInst &I) {
+    DELEGATE(MemTransferInst);
+  }
  RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
  RetTy visitMemTransferInst(MemTransferInst &I)  { DELEGATE(MemIntrinsic); }
  RetTy visitMemIntrinsic(MemIntrinsic &I)        { DELEGATE(IntrinsicInst); }
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@ -915,7 +915,8 @@ class MemCpyInst : public MemTransferInst {
 public:
  // Methods for support type inquiry through isa, cast, and dyn_cast:
  static bool classof(const IntrinsicInst *I) {
-    return I->getIntrinsicID() == Intrinsic::memcpy;
+    return I->getIntrinsicID() == Intrinsic::memcpy ||
+           I->getIntrinsicID() == Intrinsic::memcpy_inline;
  }
  static bool classof(const Value *V) {
    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@ -935,10 +936,10 @@ public:
 };

 /// This class wraps the llvm.memcpy.inline intrinsic.
-class MemCpyInlineInst : public MemTransferInst {
+class MemCpyInlineInst : public MemCpyInst {
 public:
  ConstantInt *getLength() const {
-    return cast<ConstantInt>(MemTransferInst::getLength());
+    return cast<ConstantInt>(MemCpyInst::getLength());
  }
  // Methods for support type inquiry through isa, cast, and dyn_cast:
  static bool classof(const IntrinsicInst *I) {
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@ -203,14 +203,14 @@ CallInst *IRBuilderBase::CreateMemTransferInst(
  return CI;
 }

-CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign,
-                                            Value *Src, MaybeAlign SrcAlign,
-                                            Value *Size) {
+CallInst *IRBuilderBase::CreateMemCpyInline(
+    Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign,
+    Value *Size, bool IsVolatile, MDNode *TBAATag, MDNode *TBAAStructTag,
+    MDNode *ScopeTag, MDNode *NoAliasTag) {
  Dst = getCastedInt8PtrValue(Dst);
  Src = getCastedInt8PtrValue(Src);
-  Value *IsVolatile = getInt1(false);

-  Value *Ops[] = {Dst, Src, Size, IsVolatile};
+  Value *Ops[] = {Dst, Src, Size, getInt1(IsVolatile)};
  Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
  Function *F = BB->getParent();
  Module *M = F->getParent();
@ -224,6 +224,20 @@ CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign,
  if (SrcAlign)
    MCI->setSourceAlignment(*SrcAlign);

+  // Set the TBAA info if present.
+  if (TBAATag)
+    MCI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  // Set the TBAA Struct info if present.
+  if (TBAAStructTag)
+    MCI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
+
+  if (ScopeTag)
+    MCI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+  if (NoAliasTag)
+    MCI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
  return CI;
 }

--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@ -953,7 +953,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
    if (Dest == OldV)
      Dest = NewV;

-    if (isa<MemCpyInst>(MTI)) {
+    if (isa<MemCpyInlineInst>(MTI)) {
+      MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+      B.CreateMemCpyInline(Dest, MTI->getDestAlign(), Src,
+                           MTI->getSourceAlign(), MTI->getLength(),
+                           false, // isVolatile
+                           TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+    } else if (isa<MemCpyInst>(MTI)) {
      MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
      B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
                     MTI->getLength(),
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@ -831,7 +831,7 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
    return false;

  // If we're not allowed to hack on memcpy, we fail.
-  if (!HasMemcpy || DisableLIRP::Memcpy)
+  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
@ -1190,6 +1190,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
    const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
    const SCEV *BECount) {
+
+  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
+  // conservatively bail here, since otherwise we may have to transform
+  // llvm.memcpy.inline into llvm.memcpy which is illegal.
+  if (isa<MemCpyInlineInst>(TheStore))
+    return false;
+
  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header.  This allows us to insert code for it in the preheader.
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@ -691,7 +691,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {

        // We found an instruction that may write to the loaded memory.
        // We can try to promote at this position instead of the store
-        // position if nothing alias the store memory after this and the store
+        // position if nothing aliases the store memory after this and the store
        // destination is not in the range.
        if (P && P != SI) {
          if (!moveUp(SI, P, LI))
@ -1109,7 +1109,14 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
    NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
                                 MDep->getRawSource(), MDep->getSourceAlign(),
                                 M->getLength(), M->isVolatile());
-  else
+  else if (isa<MemCpyInlineInst>(M)) {
+    // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
+    // never allowed since that would allow the latter to be lowered as a call
+    // to an external function.
+    NewM = Builder.CreateMemCpyInline(
+        M->getRawDest(), M->getDestAlign(), MDep->getRawSource(),
+        MDep->getSourceAlign(), M->getLength(), M->isVolatile());
+  } else
    NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
                                MDep->getRawSource(), MDep->getSourceAlign(),
                                M->getLength(), M->isVolatile());
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@ -40,6 +40,14 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8* %dest,
  ret void
 }

+; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memcpy.inline.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8*
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
 ; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 4 %dest.group.ptr, i8* align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size) #0 {
@ -118,6 +126,7 @@ define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8* %dest

 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
 declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1

--- a/llvm/test/Transforms/LoopIdiom/memcpy-inline-intrinsic.ll
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-inline-intrinsic.ll
@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+
+%struct.S = type { i32, i32, i8 }
+
+; unsigned copy_noalias(S* __restrict a, S *b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i];
+;   }
+;   return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 12
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 12
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.08 to i64
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
+  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
+  %0 = bitcast %struct.S* %arrayidx2 to i8*
+  %1 = bitcast %struct.S* %arrayidx to i8*
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@ -8,6 +8,14 @@ target triple = "i686-apple-darwin9"
 %0 = type { x86_fp80, x86_fp80 }
 %1 = type { i32, i32 }

+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+
 define void @test1(%0* sret(%0)  %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind  {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
@ -71,7 +79,49 @@ define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {

 }

+; Same as @test2_memcpy, but the remaining memcpy should remain non-inline even
+; if the one eliminated was inline.
+define void @test3_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {
+; CHECK-LABEL: @test3_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
+  ret void

+}
+
+; Same as @test2_memcpy, but the remaining memcpy should remain inline even
+; if the one eliminated was not inline.
+define void @test4_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {
+; CHECK-LABEL: @test4_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
+  call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
+  ret void
+
+}
+
+; Same as @test2_memcpy, and the inline-ness should be preserved.
+define void @test5_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {
+; CHECK-LABEL: @test5_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
+  call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
+  ret void
+
+}


@x = external global %0
@ -202,9 +252,6 @@ exit:
 }

 declare void @test4a(i8* align 1 byval(i8))
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
-declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
-declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind

 %struct.S = type { i128, [4 x i8]}

@ -266,7 +313,6 @@ entry:

 declare i32 @g(%struct.p* align 8 byval(%struct.p))

-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind

 ; PR11142 - When looking for a memcpy-memcpy dependency, don't get stuck on
 ; instructions between the memcpy's that only affect the destination pointer.
@ -375,14 +421,5 @@ define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) {
  ret void
 }

-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
-declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind
-
 declare void @f1(%struct.big* nocapture sret(%struct.big))
 declare void @f2(%struct.big*)
-
-; CHECK: attributes #1 = { argmemonly nofree nounwind willreturn }
-; CHECK: attributes #2 = { nounwind ssp }
-; CHECK: attributes #3 = { willreturn }
-; CHECK: attributes #4 = { nounwind ssp uwtable }
-; CHECK: attributes #5 = { argmemonly nofree nounwind willreturn writeonly }