diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ef9526fd3987..ce425f1bf9fb 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@ // void foo(_Complex float *P) // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior. Negative strides *do* happen for memset/memcpy loops. -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // @@ -124,7 +119,7 @@ private: bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *SplatValue, Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount); + const SCEV *BECount, bool NegStride); bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, @@ -316,25 +311,27 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. unsigned StoreSize = (unsigned)SizeInBits >> 3; - const SCEVConstant *Stride = dyn_cast(StoreEv->getOperand(1)); - - if (!Stride || StoreSize != Stride->getValue()->getValue()) { - // TODO: Could also handle negative stride here someday, that will require - // the validity check in mayLoopAccessLocation to be updated though. - // Enable this to print exact negative strides. - if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { - dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; - dbgs() << "BB: " << *SI->getParent(); - } + const SCEVConstant *ConstStride = + dyn_cast(StoreEv->getOperand(1)); + if (!ConstStride) return false; - } + + APInt Stride = ConstStride->getValue()->getValue(); + if (StoreSize != Stride && StoreSize != -Stride) + return false; + + bool NegStride = StoreSize == -Stride; // See if we can optimize just this store in isolation. if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount)) + StoredVal, SI, StoreEv, BECount, NegStride)) return true; + // TODO: We don't handle negative stride memcpys. + if (NegStride) + return false; + // If the stored value is a strided load in the same loop with the same stride // this this may be transformable into a memcpy. This kicks in for stuff like // for (i) A[i] = B[i]; @@ -387,7 +384,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, MSI->getAlignment(), MSI->getValue(), MSI, Ev, - BECount); + BECount, /*NegStride=*/false); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -468,7 +465,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount) { + const SCEV *BECount, bool NegStride) { // If the stored value is a byte-wise value (like i32 -1), then it may be // turned into a memset of i8 -1, assuming that all the consecutive bytes @@ -506,15 +503,27 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEVExpander Expander(*SE, DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); + + const SCEV *Start = Ev->getStart(); + // If we have a negative stride, Start refers to the end of the memory + // location we're trying to memset. Therefore, we need to recompute the start + // point, which is just Start - BECount*Size. + if (NegStride) { + const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); + if (StoreSize != 1) + Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + SCEV::FlagNUW); + Start = SE->getMinusSCEV(Ev->getStart(), Index); + } // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - Value *BasePtr = Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, - Preheader->getTerminator()); - + Value *BasePtr = + Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, *AA, TheStore)) { Expander.clear(); @@ -527,7 +536,6 @@ bool LoopIdiomRecognize::processLoopStridedStore( // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll index c633ae95d16f..9743caee25dd 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -424,3 +424,83 @@ exit: ret void ; CHECK: ret void } + +; Recognize loops with a negative stride. +define void @test15(i32* nocapture %f) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +; CHECK-LABEL: @test15( +; CHECK: call void @llvm.memset.p0i8.i64(i8* %f1, i8 0, i64 262148, i32 4, i1 false) +; CHECK-NOT: store +; CHECK: ret void +} + +; Loop with a negative stride. Verify an aliasing write to f[65536] prevents +; the creation of a memset. +define void @test16(i32* nocapture %f) { +entry: + %arrayidx1 = getelementptr inbounds i32, i32* %f, i64 65536 + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + store i32 1, i32* %arrayidx1, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body + ret void +; CHECK-LABEL: @test16( +; CHECK-NOT: call void @llvm.memset.p0i8.i64 +; CHECK: ret void +} + +; We don't handle memcpy-able loops with negative stride. +define noalias i32* @test17(i32* nocapture readonly %a, i32 %c) { +entry: + %conv = sext i32 %c to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul) + %0 = bitcast i8* %call to i32* + %tobool.9 = icmp eq i32 %c, 0 + br i1 %tobool.9, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %dec10.in = phi i32 [ %dec10, %while.body ], [ %c, %while.body.preheader ] + %dec10 = add nsw i32 %dec10.in, -1 + %idxprom = sext i32 %dec10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 %idxprom + store i32 %1, i32* %arrayidx2, align 4 + %tobool = icmp eq i32 %dec10, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + ret i32* %0 +; CHECK-LABEL: @test17( +; CHECK-NOT: call void @llvm.memcpy +; CHECK: ret i32* +} + +declare noalias i8* @malloc(i64)