The memcpy optimizer was happily doing call slot forwarding when the new memory

was less aligned than the old. In the testcase this results in an overaligned memset: the memset alignment was correct for the original memory but is too much for the new memory. Fix this by either increasing the alignment of the new memory or bailing out if that isn't possible. Should fix the gcc-4.7 self-host buildbot failure. llvm-svn: 165220
2012-10-04 10:54:40 +00:00 · 2012-10-04 10:54:40 +00:00 · c6ada69a14
parent 43c8b46deb
commit c6ada69a14
2 changed files with 51 additions and 7 deletions
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@ -332,7 +332,7 @@ namespace {
    bool processMemCpy(MemCpyInst *M);
    bool processMemMove(MemMoveInst *M);
    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
-                              uint64_t cpyLen, CallInst *C);
+                              uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
    bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                       uint64_t MSize);
    bool processByValArgument(CallSite CS, unsigned ArgNo);
@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
      }

      if (C) {
+        unsigned storeAlign = SI->getAlignment();
+        if (!storeAlign)
+          storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType());
+        unsigned loadAlign = LI->getAlignment();
+        if (!loadAlign)
+          loadAlign = TD->getABITypeAlignment(LI->getType());
+
        bool changed = performCallSlotOptzn(LI,
                        SI->getPointerOperand()->stripPointerCasts(),
                        LI->getPointerOperand()->stripPointerCasts(),
-                        TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+                        TD->getTypeStoreSize(SI->getOperand(0)->getType()),
+                        std::min(storeAlign, loadAlign), C);
        if (changed) {
          MD->removeInstruction(SI);
          SI->eraseFromParent();
@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 /// the call write its result directly into the destination of the memcpy.
 bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
                                     Value *cpyDest, Value *cpySrc,
-                                     uint64_t cpyLen, CallInst *C) {
+                                     uint64_t cpyLen, unsigned cpyAlign,
+                                     CallInst *C) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
@ -596,6 +605,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
  if (cpyLen < srcSize)
    return false;

+  // Check that dest points to memory that is at least as aligned as src.
+  unsigned srcAlign = srcAlloca->getAlignment();
+  if (!srcAlign)
+    srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType());
+  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+  // If dest is not aligned enough and we can't increase its alignment then
+  // bail out.
+  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+    return false;
+
  // Check that accessing the first srcSize bytes of dest will not cause a
  // trap.  Otherwise the transform is invalid since it might cause a trap
  // to occur earlier than it otherwise would.
@ -687,6 +706,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
  if (!changedArgument)
    return false;

+  // If the destination wasn't sufficiently aligned then increase its alignment.
+  if (!isDestSufficientlyAligned) {
+    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+  }
+
  // Drop any cached information about the call, because we may have changed
  // its dependence information by changing its parameter.
  MD->removeInstruction(C);
@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
  if (DepInfo.isClobber()) {
    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), C)) {
+                               CopySize->getZExtValue(), M->getAlignment(),
+                               C)) {
        MD->removeInstruction(M);
        M->eraseFromParent();
        return true;
--- a/llvm/test/Transforms/MemCpyOpt/align.ll
+++ b/llvm/test/Transforms/MemCpyOpt/align.ll
@ -1,12 +1,15 @@
-; RUN: opt < %s -S -memcpyopt | FileCheck %s
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"

+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
 ; The resulting memset is only 4-byte aligned, despite containing
 ; a 16-byte aligned store in the middle.

-; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
-
 define void @foo(i32* %p) {
+; CHECK: @foo
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
  %a0 = getelementptr i32* %p, i64 0
  store i32 0, i32* %a0, align 4
  %a1 = getelementptr i32* %p, i64 1
@ -17,3 +20,18 @@ define void @foo(i32* %p) {
  store i32 0, i32* %a3, align 4
  ret void
 }
+
+; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4.
+
+define void @bar() {
+; CHECK: @bar
+; CHECK: %a4 = alloca i32, align 8
+; CHECK-NOT: memcpy
+  %a4 = alloca i32, align 4
+  %a8 = alloca i32, align 8
+  %a8.cast = bitcast i32* %a8 to i8*
+  %a4.cast = bitcast i32* %a4 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %a8.cast, i8 0, i64 4, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a4.cast, i8* %a8.cast, i64 4, i32 4, i1 false)
+  ret void
+}