forked from OSchip/llvm-project
Reapply: [LIR] Add support for creating memsets from loops with a negative stride.
The simple fix is to prevent forming memcpy from loops with a negative stride. llvm-svn: 251518
This commit is contained in:
parent
ef607a2089
commit
7967614b2b
|
@ -31,11 +31,6 @@
|
|||
// void foo(_Complex float *P)
|
||||
// for (i) { __real__(*P) = 0; __imag__(*P) = 0; }
|
||||
//
|
||||
// We should enhance this to handle negative strides through memory.
|
||||
// Alternatively (and perhaps better) we could rely on an earlier pass to force
|
||||
// forward iteration through memory, which is generally better for cache
|
||||
// behavior. Negative strides *do* happen for memset/memcpy loops.
|
||||
//
|
||||
// This could recognize common matrix multiplies and dot product idioms and
|
||||
// replace them with calls to BLAS (if linked in??).
|
||||
//
|
||||
|
@ -124,7 +119,7 @@ private:
|
|||
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
|
||||
unsigned StoreAlignment, Value *SplatValue,
|
||||
Instruction *TheStore, const SCEVAddRecExpr *Ev,
|
||||
const SCEV *BECount);
|
||||
const SCEV *BECount, bool NegStride);
|
||||
bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
|
||||
const SCEVAddRecExpr *StoreEv,
|
||||
const SCEVAddRecExpr *LoadEv,
|
||||
|
@ -316,25 +311,27 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
|
|||
// Check to see if the stride matches the size of the store. If so, then we
|
||||
// know that every byte is touched in the loop.
|
||||
unsigned StoreSize = (unsigned)SizeInBits >> 3;
|
||||
const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
|
||||
|
||||
if (!Stride || StoreSize != Stride->getValue()->getValue()) {
|
||||
// TODO: Could also handle negative stride here someday, that will require
|
||||
// the validity check in mayLoopAccessLocation to be updated though.
|
||||
// Enable this to print exact negative strides.
|
||||
if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) {
|
||||
dbgs() << "NEGATIVE STRIDE: " << *SI << "\n";
|
||||
dbgs() << "BB: " << *SI->getParent();
|
||||
}
|
||||
|
||||
const SCEVConstant *ConstStride =
|
||||
dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
|
||||
if (!ConstStride)
|
||||
return false;
|
||||
}
|
||||
|
||||
APInt Stride = ConstStride->getValue()->getValue();
|
||||
if (StoreSize != Stride && StoreSize != -Stride)
|
||||
return false;
|
||||
|
||||
bool NegStride = StoreSize == -Stride;
|
||||
|
||||
// See if we can optimize just this store in isolation.
|
||||
if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
|
||||
StoredVal, SI, StoreEv, BECount))
|
||||
StoredVal, SI, StoreEv, BECount, NegStride))
|
||||
return true;
|
||||
|
||||
// TODO: We don't handle negative stride memcpys.
|
||||
if (NegStride)
|
||||
return false;
|
||||
|
||||
// If the stored value is a strided load in the same loop with the same stride
|
||||
// this this may be transformable into a memcpy. This kicks in for stuff like
|
||||
// for (i) A[i] = B[i];
|
||||
|
@ -387,7 +384,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
|
|||
|
||||
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
|
||||
MSI->getAlignment(), MSI->getValue(), MSI, Ev,
|
||||
BECount);
|
||||
BECount, /*NegStride=*/false);
|
||||
}
|
||||
|
||||
/// mayLoopAccessLocation - Return true if the specified loop might access the
|
||||
|
@ -468,7 +465,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) {
|
|||
bool LoopIdiomRecognize::processLoopStridedStore(
|
||||
Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
|
||||
Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
|
||||
const SCEV *BECount) {
|
||||
const SCEV *BECount, bool NegStride) {
|
||||
|
||||
// If the stored value is a byte-wise value (like i32 -1), then it may be
|
||||
// turned into a memset of i8 -1, assuming that all the consecutive bytes
|
||||
|
@ -506,15 +503,27 @@ bool LoopIdiomRecognize::processLoopStridedStore(
|
|||
SCEVExpander Expander(*SE, DL, "loop-idiom");
|
||||
|
||||
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
|
||||
Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);
|
||||
|
||||
const SCEV *Start = Ev->getStart();
|
||||
// If we have a negative stride, Start refers to the end of the memory
|
||||
// location we're trying to memset. Therefore, we need to recompute the start
|
||||
// point, which is just Start - BECount*Size.
|
||||
if (NegStride) {
|
||||
const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
|
||||
if (StoreSize != 1)
|
||||
Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
|
||||
SCEV::FlagNUW);
|
||||
Start = SE->getMinusSCEV(Ev->getStart(), Index);
|
||||
}
|
||||
|
||||
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
|
||||
// this into a memset in the loop preheader now if we want. However, this
|
||||
// would be unsafe to do if there is anything else in the loop that may read
|
||||
// or write to the aliased location. Check for any overlap by generating the
|
||||
// base pointer and checking the region.
|
||||
Value *BasePtr = Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
|
||||
Preheader->getTerminator());
|
||||
|
||||
Value *BasePtr =
|
||||
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
|
||||
if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
|
||||
*AA, TheStore)) {
|
||||
Expander.clear();
|
||||
|
@ -527,7 +536,6 @@ bool LoopIdiomRecognize::processLoopStridedStore(
|
|||
|
||||
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
|
||||
// pointer size if it isn't already.
|
||||
Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);
|
||||
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
|
||||
|
||||
const SCEV *NumBytesS =
|
||||
|
|
|
@ -424,3 +424,83 @@ exit:
|
|||
ret void
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
; Recognize loops with a negative stride.
|
||||
define void @test15(i32* nocapture %f) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
|
||||
store i32 0, i32* %arrayidx, align 4
|
||||
%indvars.iv.next = add nsw i64 %indvars.iv, -1
|
||||
%cmp = icmp sgt i64 %indvars.iv, 0
|
||||
br i1 %cmp, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup:
|
||||
ret void
|
||||
; CHECK-LABEL: @test15(
|
||||
; CHECK: call void @llvm.memset.p0i8.i64(i8* %f1, i8 0, i64 262148, i32 4, i1 false)
|
||||
; CHECK-NOT: store
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
; Loop with a negative stride. Verify an aliasing write to f[65536] prevents
|
||||
; the creation of a memset.
|
||||
define void @test16(i32* nocapture %f) {
|
||||
entry:
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %f, i64 65536
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
|
||||
store i32 0, i32* %arrayidx, align 4
|
||||
store i32 1, i32* %arrayidx1, align 4
|
||||
%indvars.iv.next = add nsw i64 %indvars.iv, -1
|
||||
%cmp = icmp sgt i64 %indvars.iv, 0
|
||||
br i1 %cmp, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
ret void
|
||||
; CHECK-LABEL: @test16(
|
||||
; CHECK-NOT: call void @llvm.memset.p0i8.i64
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
; We don't handle memcpy-able loops with negative stride.
|
||||
define noalias i32* @test17(i32* nocapture readonly %a, i32 %c) {
|
||||
entry:
|
||||
%conv = sext i32 %c to i64
|
||||
%mul = shl nsw i64 %conv, 2
|
||||
%call = tail call noalias i8* @malloc(i64 %mul)
|
||||
%0 = bitcast i8* %call to i32*
|
||||
%tobool.9 = icmp eq i32 %c, 0
|
||||
br i1 %tobool.9, label %while.end, label %while.body.preheader
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%dec10.in = phi i32 [ %dec10, %while.body ], [ %c, %while.body.preheader ]
|
||||
%dec10 = add nsw i32 %dec10.in, -1
|
||||
%idxprom = sext i32 %dec10 to i64
|
||||
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
|
||||
%1 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %0, i64 %idxprom
|
||||
store i32 %1, i32* %arrayidx2, align 4
|
||||
%tobool = icmp eq i32 %dec10, 0
|
||||
br i1 %tobool, label %while.end.loopexit, label %while.body
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
ret i32* %0
|
||||
; CHECK-LABEL: @test17(
|
||||
; CHECK-NOT: call void @llvm.memcpy
|
||||
; CHECK: ret i32*
|
||||
}
|
||||
|
||||
declare noalias i8* @malloc(i64)
|
||||
|
|
Loading…
Reference in New Issue