[LV] Parallel annotated loop does not imply all loads can be hoisted.

As noted in https://bugs.llvm.org/show_bug.cgi?id=46666, the current behavior — assuming if-conversion safety whenever a loop is annotated parallel (`!llvm.loop.parallel_accesses`) — is not what users expect, the documentation for this behavior has since been removed from the LangRef again, and it can lead to invalid reads.
This was observed in POCL (https://github.com/pocl/pocl/issues/757) and would require similar workarounds in current work at hipSYCL.

The question remains why this was initially added and what the implications of removing this optimization would be.
Do we need an alternative mechanism to propagate the information about legality of if-conversion?
Or is the idea that conditional loads in `#pragma clang loop vectorize(assume_safety)` can be executed unmasked without additional checks flawed in general?
I think this implication is not part of what a user of that pragma (and corresponding metadata) would expect, and is thus dangerous.

Only two additional tests failed; they are adapted in this patch. Depending on the further direction, force-ifcvt.ll should be removed or further adapted.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D103907
This commit is contained in:
Joachim Meyer 2021-06-08 18:16:08 +02:00
parent 7b969ef8b4
commit 4f01122c3f
4 changed files with 10 additions and 65 deletions

View File

@ -435,22 +435,17 @@ private:
bool canVectorizeOuterLoop();
/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking. If's that
/// guard loads can be ignored under "assume safety" unless \p PreserveGuards
/// is true. This can happen when we introduces guards for which the original
/// "unguarded-loads are safe" assumption does not hold. For example, the
/// vectorizer's fold-tail transformation changes the loop to execute beyond
/// its original trip-count, under a proper guard, which should be preserved.
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
/// that we can read from them without segfault.
/// \p MaskedOp is a list of instructions that have to be transformed into
/// calls to the appropriate masked intrinsic when the loop is vectorized.
/// \p ConditionalAssumes is a list of assume instructions in predicated
/// blocks that must be dropped if the CFG gets flattened.
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
bool blockCanBePredicated(
BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
SmallPtrSetImpl<const Instruction *> &MaskedOp,
SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
bool PreserveGuards = false) const;
SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const;
/// Updates the vectorization state by adding \p Phi to the inductions list.
/// This can set \p Phi as the main induction of the loop if \p Phi is a

View File

@ -955,10 +955,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
bool LoopVectorizationLegality::blockCanBePredicated(
BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
SmallPtrSetImpl<const Instruction *> &MaskedOp,
SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
bool PreserveGuards) const {
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
for (Instruction &I : *BB) {
// Check that we don't have a constant expression that can trap as operand.
for (Value *Operand : I.operands()) {
@ -986,10 +983,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
// !llvm.mem.parallel_loop_access implies if-conversion safety.
// Otherwise, record that the load needs (real or emulated) masking
// and let the cost model decide.
if (!IsAnnotatedParallel || PreserveGuards)
MaskedOp.insert(LI);
continue;
}
@ -1306,8 +1299,7 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
// do not need predication such as the header block.
for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
TmpConditionalAssumes,
/* MaskAllLoads= */ true)) {
TmpConditionalAssumes)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
return false;
}

View File

@ -1,42 +0,0 @@
; RUN: opt -loop-vectorize -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
entry:
br label %for.body
; CHECK-LABEL: @Test
; CHECK: <4 x i32>
for.body: ; preds = %cond.end, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
%arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1
%cmp1 = icmp eq i32 %0, 0
%arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
%1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1
br i1 %cmp1, label %cond.end, label %cond.false
cond.false: ; preds = %for.body
%arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
%2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1
%add = add nsw i32 %2, %1
br label %cond.end
cond.end: ; preds = %for.body, %cond.false
%cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 16
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end: ; preds = %cond.end
ret void
}
attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
!1 = distinct !{}

View File

@ -51,7 +51,7 @@ for.inc:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
}
; Case2: With pragma assume_safety only the store is masked.
; Case2: With pragma assume_safety, both load and store are masked.
; void assume_safety(int * p, int * q1, int * q2, int guard) {
; #pragma clang loop vectorize(assume_safety)
; for(int ix=0; ix < 1021; ++ix) {
@ -63,7 +63,7 @@ for.inc:
;CHECK-LABEL: @assume_safety
;CHECK: vector.body:
;CHECK-NOT: @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store
; Function Attrs: norecurse nounwind uwtable