[Intrinsics] Make MemCpyInlineInst a MemCpyInst

This opens up more optimization opportunities in passes that already handle MemCpyInst's.

Differential revision: https://reviews.llvm.org/D105247
This commit is contained in:
Jon Roelofs 2021-06-30 16:49:48 -07:00
parent 2ff5a56e1a
commit 37b6e03c18
10 changed files with 178 additions and 27 deletions

View File

@ -647,8 +647,11 @@ public:
TBAAStructTag, ScopeTag, NoAliasTag);
}
CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src,
MaybeAlign SrcAlign, Value *Size);
CallInst *
CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src,
MaybeAlign SrcAlign, Value *Size, bool IsVolatile = false,
MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr,
MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
/// Create and insert an element unordered-atomic memcpy between the
/// specified pointers.

View File

@ -209,6 +209,9 @@ public:
RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); }
RetTy visitMemCpyInst(MemCpyInst &I) { DELEGATE(MemTransferInst); }
RetTy visitMemCpyInlineInst(MemCpyInlineInst &I) {
DELEGATE(MemTransferInst);
}
RetTy visitMemMoveInst(MemMoveInst &I) { DELEGATE(MemTransferInst); }
RetTy visitMemTransferInst(MemTransferInst &I) { DELEGATE(MemIntrinsic); }
RetTy visitMemIntrinsic(MemIntrinsic &I) { DELEGATE(IntrinsicInst); }

View File

@ -915,7 +915,8 @@ class MemCpyInst : public MemTransferInst {
public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy;
return I->getIntrinsicID() == Intrinsic::memcpy ||
I->getIntrinsicID() == Intrinsic::memcpy_inline;
}
static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@ -935,10 +936,10 @@ public:
};
/// This class wraps the llvm.memcpy.inline intrinsic.
class MemCpyInlineInst : public MemTransferInst {
class MemCpyInlineInst : public MemCpyInst {
public:
ConstantInt *getLength() const {
return cast<ConstantInt>(MemTransferInst::getLength());
return cast<ConstantInt>(MemCpyInst::getLength());
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {

View File

@ -203,14 +203,14 @@ CallInst *IRBuilderBase::CreateMemTransferInst(
return CI;
}
CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign,
Value *Src, MaybeAlign SrcAlign,
Value *Size) {
CallInst *IRBuilderBase::CreateMemCpyInline(
Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign,
Value *Size, bool IsVolatile, MDNode *TBAATag, MDNode *TBAAStructTag,
MDNode *ScopeTag, MDNode *NoAliasTag) {
Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);
Value *IsVolatile = getInt1(false);
Value *Ops[] = {Dst, Src, Size, IsVolatile};
Value *Ops[] = {Dst, Src, Size, getInt1(IsVolatile)};
Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
Function *F = BB->getParent();
Module *M = F->getParent();
@ -224,6 +224,20 @@ CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign,
if (SrcAlign)
MCI->setSourceAlignment(*SrcAlign);
// Set the TBAA info if present.
if (TBAATag)
MCI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
// Set the TBAA Struct info if present.
if (TBAAStructTag)
MCI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
if (ScopeTag)
MCI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
if (NoAliasTag)
MCI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
return CI;
}

View File

@ -953,7 +953,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
if (Dest == OldV)
Dest = NewV;
if (isa<MemCpyInst>(MTI)) {
if (isa<MemCpyInlineInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
B.CreateMemCpyInline(Dest, MTI->getDestAlign(), Src,
MTI->getSourceAlign(), MTI->getLength(),
false, // isVolatile
TBAA, TBAAStruct, ScopeMD, NoAliasMD);
} else if (isa<MemCpyInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
MTI->getLength(),

View File

@ -831,7 +831,7 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
return false;
// If we're not allowed to hack on memcpy, we fail.
if (!HasMemcpy || DisableLIRP::Memcpy)
if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
return false;
Value *Dest = MCI->getDest();
@ -1190,6 +1190,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
const SCEV *BECount) {
// FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
// conservatively bail here, since otherwise we may have to transform
// llvm.memcpy.inline into llvm.memcpy which is illegal.
if (isa<MemCpyInlineInst>(TheStore))
return false;
// The trip count of the loop and the base pointer of the addrec SCEV is
// guaranteed to be loop invariant, which means that it should dominate the
// header. This allows us to insert code for it in the preheader.

View File

@ -691,7 +691,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// We found an instruction that may write to the loaded memory.
// We can try to promote at this position instead of the store
// position if nothing alias the store memory after this and the store
// position if nothing aliases the store memory after this and the store
// destination is not in the range.
if (P && P != SI) {
if (!moveUp(SI, P, LI))
@ -1109,7 +1109,14 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
else
else if (isa<MemCpyInlineInst>(M)) {
// llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
// never allowed since that would allow the latter to be lowered as a call
// to an external function.
NewM = Builder.CreateMemCpyInline(
M->getRawDest(), M->getDestAlign(), MDep->getRawSource(),
MDep->getSourceAlign(), M->getLength(), M->isVolatile());
} else
NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());

View File

@ -40,6 +40,14 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8* %dest,
ret void
}
; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_src_with_group(
; CHECK: call void @llvm.memcpy.inline.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr) #0 {
%cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8*
call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
ret void
}
; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 4 %dest.group.ptr, i8* align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size) #0 {
@ -118,6 +126,7 @@ define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8* %dest
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1

View File

@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-idiom < %s -S | FileCheck %s
%struct.S = type { i32, i32, i8 }
; unsigned copy_noalias(S* __restrict a, S *b, int n) {
; for (int i = 0; i < n; i++) {
; a[i] = b[i];
; }
; return sizeof(a[0]);
; }
; Function Attrs: nofree nounwind uwtable mustprogress
define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
; CHECK-LABEL: @copy_noalias(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 12
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
;
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret i32 12
for.body: ; preds = %for.body.preheader, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%idxprom = zext i32 %i.08 to i64
%arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
%arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
%0 = bitcast %struct.S* %arrayidx2 to i8*
%1 = bitcast %struct.S* %arrayidx to i8*
call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
%inc = add nuw nsw i32 %i.08, 1
%cmp = icmp slt i32 %inc, %n
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

View File

@ -8,6 +8,14 @@ target triple = "i686-apple-darwin9"
%0 = type { x86_fp80, x86_fp80 }
%1 = type { i32, i32 }
declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
define void @test1(%0* sret(%0) %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
@ -71,7 +79,49 @@ define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
}
; Same as @test2_memcpy, but the remaining memcpy should remain non-inline even
; if the one eliminated was inline.
define void @test3_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
; CHECK-LABEL: @test3_memcpy(
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; CHECK-NEXT: ret void
;
%memtmp = alloca %0, align 16
%R = bitcast %0* %memtmp to i8*
call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
ret void
}
; Same as @test2_memcpy, but the remaining memcpy should remain inline even
; if the one eliminated was not inline.
define void @test4_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
; CHECK-LABEL: @test4_memcpy(
; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; CHECK-NEXT: ret void
;
%memtmp = alloca %0, align 16
%R = bitcast %0* %memtmp to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
ret void
}
; Same as @test2_memcpy, and the inline-ness should be preserved.
define void @test5_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
; CHECK-LABEL: @test5_memcpy(
; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
; CHECK-NEXT: ret void
;
%memtmp = alloca %0, align 16
%R = bitcast %0* %memtmp to i8*
call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
ret void
}
@x = external global %0
@ -202,9 +252,6 @@ exit:
}
declare void @test4a(i8* align 1 byval(i8))
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
%struct.S = type { i128, [4 x i8]}
@ -266,7 +313,6 @@ entry:
declare i32 @g(%struct.p* align 8 byval(%struct.p))
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
; PR11142 - When looking for a memcpy-memcpy dependency, don't get stuck on
; instructions between the memcpy's that only affect the destination pointer.
@ -375,14 +421,5 @@ define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) {
ret void
}
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind
declare void @f1(%struct.big* nocapture sret(%struct.big))
declare void @f2(%struct.big*)
; CHECK: attributes #1 = { argmemonly nofree nounwind willreturn }
; CHECK: attributes #2 = { nounwind ssp }
; CHECK: attributes #3 = { willreturn }
; CHECK: attributes #4 = { nounwind ssp uwtable }
; CHECK: attributes #5 = { argmemonly nofree nounwind willreturn writeonly }