From 4f01122c3f6c70beee8f736f196a09976602685f Mon Sep 17 00:00:00 2001 From: Joachim Meyer Date: Tue, 8 Jun 2021 18:16:08 +0200 Subject: [PATCH] [LV] Parallel annotated loop does not imply all loads can be hoisted. As noted in https://bugs.llvm.org/show_bug.cgi?id=46666, the current behavior of assuming if-conversion safety if a loop is annotated parallel (`!llvm.loop.parallel_accesses`), is not expectable, the documentation for this behavior was since removed from the LangRef again, and can lead to invalid reads. This was observed in POCL (https://github.com/pocl/pocl/issues/757) and would require similar workarounds in current work at hipSYCL. The question remains why this was initially added and what the implications of removing this optimization would be. Do we need an alternative mechanism to propagate the information about legality of if-conversion? Or is the idea that conditional loads in `#pragma clang loop vectorize(assume_safety)` can be executed unmasked without additional checks flawed in general? I think this implication is not part of what a user of that pragma (and corresponding metadata) would expect and thus dangerous. Only two additional tests failed, which are adapted in this patch. Depending on the further direction force-ifcvt.ll should be removed or further adapted. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D103907 --- .../Vectorize/LoopVectorizationLegality.h | 15 +++---- .../Vectorize/LoopVectorizationLegality.cpp | 14 ++----- .../LoopVectorize/X86/force-ifcvt.ll | 42 ------------------- .../X86/tail_folding_and_assume_safety.ll | 4 +- 4 files changed, 10 insertions(+), 65 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 9da5ce5ca157..e7dcdda8af89 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -435,22 +435,17 @@ private: bool canVectorizeOuterLoop(); /// Return true if all of the instructions in the block can be speculatively - /// executed, and record the loads/stores that require masking. If's that - /// guard loads can be ignored under "assume safety" unless \p PreserveGuards - /// is true. This can happen when we introduces guards for which the original - /// "unguarded-loads are safe" assumption does not hold. For example, the - /// vectorizer's fold-tail transformation changes the loop to execute beyond - /// its original trip-count, under a proper guard, which should be preserved. + /// executed, and record the loads/stores that require masking. /// \p SafePtrs is a list of addresses that are known to be legal and we know /// that we can read from them without segfault. /// \p MaskedOp is a list of instructions that have to be transformed into /// calls to the appropriate masked intrinsic when the loop is vectorized. /// \p ConditionalAssumes is a list of assume instructions in predicated /// blocks that must be dropped if the CFG gets flattened. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs, - SmallPtrSetImpl &MaskedOp, - SmallPtrSetImpl &ConditionalAssumes, - bool PreserveGuards = false) const; + bool blockCanBePredicated( + BasicBlock *BB, SmallPtrSetImpl &SafePtrs, + SmallPtrSetImpl &MaskedOp, + SmallPtrSetImpl &ConditionalAssumes) const; /// Updates the vectorization state by adding \p Phi to the inductions list. /// This can set \p Phi as the main induction of the loop if \p Phi is a diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 21f2378bddb9..1c08e77a0d4f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -955,10 +955,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { bool LoopVectorizationLegality::blockCanBePredicated( BasicBlock *BB, SmallPtrSetImpl &SafePtrs, SmallPtrSetImpl &MaskedOp, - SmallPtrSetImpl &ConditionalAssumes, - bool PreserveGuards) const { - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - + SmallPtrSetImpl &ConditionalAssumes) const { for (Instruction &I : *BB) { // Check that we don't have a constant expression that can trap as operand. for (Value *Operand : I.operands()) { @@ -986,11 +983,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - // !llvm.mem.parallel_loop_access implies if-conversion safety. - // Otherwise, record that the load needs (real or emulated) masking - // and let the cost model decide. - if (!IsAnnotatedParallel || PreserveGuards) - MaskedOp.insert(LI); + MaskedOp.insert(LI); continue; } } @@ -1306,8 +1299,7 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes, - /* MaskAllLoads= */ true)) { + TmpConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll deleted file mode 100644 index 07b98b4fb001..000000000000 --- a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: opt -loop-vectorize -S < %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind uwtable -define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 { -entry: - br label %for.body - -; CHECK-LABEL: @Test -; CHECK: <4 x i32> - -for.body: ; preds = %cond.end, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ] - %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv - %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1 - %cmp1 = icmp eq i32 %0, 0 - %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv - %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1 - br i1 %cmp1, label %cond.end, label %cond.false - -cond.false: ; preds = %for.body - %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv - %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1 - %add = add nsw i32 %2, %1 - br label %cond.end - -cond.end: ; preds = %for.body, %cond.false - %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ] - store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 16 - br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 - -for.end: ; preds = %cond.end - ret void -} - -attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } - -!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}} -!1 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll index 10477ddce9c9..5f9c477746db 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll @@ -51,7 +51,7 @@ for.inc: br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8 } -; Case2: With pragma assume_safety only the store is masked. +; Case2: With pragma assume_safety both, load and store are masked. ; void assume_safety(int * p, int * q1, int * q2, int guard) { ; #pragma clang loop vectorize(assume_safety) ; for(int ix=0; ix < 1021; ++ix) { @@ -63,7 +63,7 @@ for.inc: ;CHECK-LABEL: @assume_safety ;CHECK: vector.body: -;CHECK-NOT: @llvm.masked.load +;CHECK: call <8 x i32> @llvm.masked.load ;CHECK: call void @llvm.masked.store ; Function Attrs: norecurse nounwind uwtable