diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 35acc7923879..68be3b7c77fd 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -332,7 +332,7 @@ namespace {
     bool processMemCpy(MemCpyInst *M);
     bool processMemMove(MemMoveInst *M);
     bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
-                              uint64_t cpyLen, CallInst *C);
+                              uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                        uint64_t MSize);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
@@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       }

       if (C) {
+        unsigned storeAlign = SI->getAlignment();
+        if (!storeAlign)
+          storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType());
+        unsigned loadAlign = LI->getAlignment();
+        if (!loadAlign)
+          loadAlign = TD->getABITypeAlignment(LI->getType());
+
         bool changed = performCallSlotOptzn(LI,
                        SI->getPointerOperand()->stripPointerCasts(),
                        LI->getPointerOperand()->stripPointerCasts(),
-                       TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+                       TD->getTypeStoreSize(SI->getOperand(0)->getType()),
+                       std::min(storeAlign, loadAlign), C);
         if (changed) {
           MD->removeInstruction(SI);
           SI->eraseFromParent();
@@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 /// the call write its result directly into the destination of the memcpy.
 bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
                                      Value *cpyDest, Value *cpySrc,
-                                     uint64_t cpyLen, CallInst *C) {
+                                     uint64_t cpyLen, unsigned cpyAlign,
+                                     CallInst *C) {
   // The general transformation to keep in mind is
   //
   //   call @func(..., src, ...)
@@ -596,6 +605,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   if (cpyLen < srcSize)
     return false;

+  // Check that dest points to memory that is at least as aligned as src.
+  unsigned srcAlign = srcAlloca->getAlignment();
+  if (!srcAlign)
+    srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType());
+  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+  // If dest is not aligned enough and we can't increase its alignment then
+  // bail out.
+  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+    return false;
+
   // Check that accessing the first srcSize bytes of dest will not cause a
   // trap.  Otherwise the transform is invalid since it might cause a trap
   // to occur earlier than it otherwise would.
@@ -687,6 +706,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   if (!changedArgument)
     return false;

+  // If the destination wasn't sufficiently aligned then increase its alignment.
+  if (!isDestSufficientlyAligned) {
+    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+  }
+
   // Drop any cached information about the call, because we may have changed
   // its dependence information by changing its parameter.
   MD->removeInstruction(C);
@@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
   if (DepInfo.isClobber()) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
       if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), C)) {
+                               CopySize->getZExtValue(), M->getAlignment(),
+                               C)) {
         MD->removeInstruction(M);
         M->eraseFromParent();
         return true;
diff --git a/llvm/test/Transforms/MemCpyOpt/align.ll b/llvm/test/Transforms/MemCpyOpt/align.ll
index b1f900d9da4c..1b98f6ad383f 100644
--- a/llvm/test/Transforms/MemCpyOpt/align.ll
+++ b/llvm/test/Transforms/MemCpyOpt/align.ll
@@ -1,12 +1,15 @@
-; RUN: opt < %s -S -memcpyopt | FileCheck %s
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"

+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
 ; The resulting memset is only 4-byte aligned, despite containing
 ; a 16-byte aligned store in the middle.

-; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
-
 define void @foo(i32* %p) {
+; CHECK: @foo
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
   %a0 = getelementptr i32* %p, i64 0
   store i32 0, i32* %a0, align 4
   %a1 = getelementptr i32* %p, i64 1
@@ -17,3 +20,18 @@ define void @foo(i32* %p) {
   store i32 0, i32* %a3, align 4
   ret void
 }
+
+; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4.
+
+define void @bar() {
+; CHECK: @bar
+; CHECK: %a4 = alloca i32, align 8
+; CHECK-NOT: memcpy
+  %a4 = alloca i32, align 4
+  %a8 = alloca i32, align 8
+  %a8.cast = bitcast i32* %a8 to i8*
+  %a4.cast = bitcast i32* %a4 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %a8.cast, i8 0, i64 4, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a4.cast, i8* %a8.cast, i64 4, i32 4, i1 false)
+  ret void
+}
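
For reference, below is a sketch of what the new @bar test should look like after -memcpyopt runs. The call slot optimization rewrites the memset (the "call" producing the source buffer) to store directly into %a4, erases the memcpy, and raises %a4's alloca alignment from 4 to 8 so the destination satisfies the 8-byte alignment the memset carries. This output is inferred from the CHECK lines above, not taken from an actual run; in particular, whether the now-dead %a8 alloca and the bitcasts survive this pass is an assumption (later cleanup passes would normally remove them).

; Hypothetical post-memcpyopt IR for @bar, reconstructed from the CHECK lines.
define void @bar() {
  %a4 = alloca i32, align 8          ; alignment boosted from 4 to 8
  %a8 = alloca i32, align 8          ; now dead; may be cleaned up later
  %a8.cast = bitcast i32* %a8 to i8*
  %a4.cast = bitcast i32* %a4 to i8*
  ; memset now writes into %a4 directly; the memcpy has been erased
  call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 4, i32 8, i1 false)
  ret void
}

The boosting is only legal because %a4 is an alloca, whose alignment the pass is free to increase; for any other destination the new isDestSufficientlyAligned check bails out rather than let the rewritten call assume alignment the destination does not have.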