Turn a memcpy from a double* into a load/store of double instead of a load/store of i64. The latter prevents promotion/scalarrepl of the source and dest in many cases. This fixes the 300% performance regression of the byval stuff on stepanov_v1p2.

llvm-svn: 45945
commit 92bd785323 (parent 57974c8d51)
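Illustrative sketch (not part of the commit): on the test case added below, instcombine is now expected to rewrite the 8-byte memcpy between double pointers into a direct double load/store rather than an i64 one. Approximately, in the IR syntax of that era, with the result value name invented and alignment omitted:

    ; before -instcombine (taken from the test case in this commit)
    %tmp2 = bitcast double* %X to i8*
    %tmp13 = bitcast double* %Y to i8*
    call void @llvm.memcpy.i32( i8* %tmp2, i8* %tmp13, i32 8, i32 1 )

    ; after -instcombine (approximate)
    %tmp = load double* %Y
    store double %tmp, double* %X

Keeping the copy typed as double improves the odds that scalarrepl/mem2reg can later promote the source and destination, which is what the i64 load/store was blocking.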
@@ -7826,16 +7826,49 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getOperand(3));
   if (MemOpLength == 0) return 0;
 
-  // Source and destination pointer types are always "i8*" for intrinsic.
-  // If Size is 8 then use Int64Ty
-  // If Size is 4 then use Int32Ty
-  // If Size is 2 then use Int16Ty
-  // If Size is 1 then use Int8Ty
+  // Source and destination pointer types are always "i8*" for intrinsic. See
+  // if the size is something we can handle with a single primitive load/store.
+  // A single load+store correctly handles overlapping memory in the memmove
+  // case.
   unsigned Size = MemOpLength->getZExtValue();
   if (Size == 0 || Size > 8 || (Size&(Size-1)))
-    return 0;  // If not 1/2/4/8, exit.
+    return 0;  // If not 1/2/4/8 bytes, exit.
 
+  // Use an integer load+store unless we can find something better.
   Type *NewPtrTy = PointerType::getUnqual(IntegerType::get(Size<<3));
 
+  // Memcpy forces the use of i8* for the source and destination. That means
+  // that if you're using memcpy to move one double around, you'll get a cast
+  // from double* to i8*. We'd much rather use a double load+store rather than
+  // an i64 load+store, here because this improves the odds that the source or
+  // dest address will be promotable. See if we can find a better type than the
+  // integer datatype.
+  if (Value *Op = getBitCastOperand(MI->getOperand(1))) {
+    const Type *SrcETy = cast<PointerType>(Op->getType())->getElementType();
+    if (SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
+      // The SrcETy might be something like {{{double}}} or [1 x double]. Rip
+      // down through these levels if so.
+      while (!SrcETy->isFirstClassType()) {
+        if (const StructType *STy = dyn_cast<StructType>(SrcETy)) {
+          if (STy->getNumElements() == 1)
+            SrcETy = STy->getElementType(0);
+          else
+            break;
+        } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
+          if (ATy->getNumElements() == 1)
+            SrcETy = ATy->getElementType();
+          else
+            break;
+        } else
+          break;
+      }
+
+      if (SrcETy->isFirstClassType())
+        NewPtrTy = PointerType::getUnqual(SrcETy);
+    }
+  }
+
   // If the memcpy/memmove provides better alignment info than we can
   // infer, use it.
   SrcAlign = std::max(SrcAlign, CopyAlign);
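A hedged illustration of the peeling loop above (hypothetical IR, not a test from this commit): when the copied object is a single-element aggregate such as [1 x double] or {{{double}}}, the while loop strips those wrappers so the copy can still become a double load/store. Something like:

    ; hypothetical input; names are invented for illustration
    %dst = alloca [1 x double]
    %src = alloca [1 x double]
    %d8 = bitcast [1 x double]* %dst to i8*
    %s8 = bitcast [1 x double]* %src to i8*
    call void @llvm.memcpy.i32( i8* %d8, i8* %s8, i32 8, i32 1 )
    ; expected shape after the transform: SrcETy peels [1 x double] down to
    ; double, NewPtrTy becomes double*, and the copy is emitted as a double
    ; load+store instead of an i64 load+store

Note that only one-element wrappers are stripped, and only when the outer type's store size matches the copy size, so the rewrite never touches memory outside the copied object.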
@@ -7843,9 +7876,13 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
 
   Value *Src = InsertBitCastBefore(MI->getOperand(2), NewPtrTy, *MI);
   Value *Dest = InsertBitCastBefore(MI->getOperand(1), NewPtrTy, *MI);
-  Value *L = new LoadInst(Src, "tmp", false, SrcAlign, MI);
-  new StoreInst(L, Dest, false, DstAlign, MI);
-  return EraseInstFromFunction(*MI);
+  Instruction *L = new LoadInst(Src, "tmp", false, SrcAlign);
+  InsertNewInstBefore(L, *MI);
+  InsertNewInstBefore(new StoreInst(L, Dest, false, DstAlign), *MI);
+
+  // Set the size of the copy to 0, it will be deleted on the next iteration.
+  MI->setOperand(3, Constant::getNullValue(MemOpLength->getType()));
+  return MI;
 }
 
 /// visitCallInst - CallInst simplification. This mostly only handles folding
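One more hedged note on the hunk above: rather than erasing the intrinsic immediately, the new code zeroes its length operand and returns the call, and the comment states the now zero-length copy is deleted on the next iteration. On the test case below, the IR right after this visit would look roughly like this (value names illustrative):

    %tmp = load double* %Y                 ; new double load
    store double %tmp, double* %X          ; new double store
    call void @llvm.memcpy.i32( i8* %tmp2, i8* %tmp13, i32 0, i32 1 )
    ; length is now 0; this call goes away on a later instcombine iteration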
@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | opt -instcombine | llvm-dis | grep {load double}
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin8"
+
+define void @foo(double* %X, double* %Y) {
+entry:
+    %"alloca point" = bitcast i32 0 to i32      ; <i32> [#uses=0]
+    %tmp2 = bitcast double* %X to i8*           ; <i8*> [#uses=1]
+    %tmp13 = bitcast double* %Y to i8*          ; <i8*> [#uses=1]
+    call void @llvm.memcpy.i32( i8* %tmp2, i8* %tmp13, i32 8, i32 1 )
+    ret void
+}
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind