[SLP] Explicitly track required stacksave/alloca dependency

The semantics of an inalloca alloca instruction require that it not be reordered with a preceding stacksave intrinsic call.  Unfortunately, there's no def/use edge or memory dependence edge.  (The memory point is slightly subtle, but in general a new allocation can't alias with a call which executes strictly before it comes into existence.)

I'd tried to tackle this same case previously in 689babdf6, but the fix chosen there turned out to be incomplete.  As such, this change contains a full revert of the first fix attempt.

This was noticed when investigating problems which surfaced with D118538, but this is definitely an existing bug.  This time around, I managed to reduce a couple of additional cases, including one which was being actively miscompiled even without the new scheduling change.  (See test diffs)

Compile time wise, we only spend extra time when seeing a stacksave (rare), and even then we walk the block at most once per schedule window extension.  Likely a non-issue.
This commit is contained in:
Philip Reames 2022-03-20 13:50:36 -07:00
parent c1a31ee65b
commit b7806c8b37
2 changed files with 48 additions and 36 deletions

View File

@ -4023,16 +4023,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
// Avoid attempting to schedule allocas; there are unmodeled dependencies
// for "static" alloca status and for reordering with stacksave calls.
for (Value *V : VL) {
if (isa<AllocaInst>(V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to alloca.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
}
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
@ -8086,6 +8076,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
}
// If we have an inalloca alloca instruction, it needs to be scheduled
// after any preceding stacksave.
if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>())) {
for (Instruction *I = BundleMember->Inst->getNextNode();
I != ScheduleEnd; I = I->getNextNode()) {
if (match(I, m_Intrinsic<Intrinsic::stacksave>()))
// Any allocas past here must be control dependent on I, and I
// must be memory dependent on BundleMember->Inst.
break;
if (!isa<AllocaInst>(I))
continue;
// Add the dependency
auto *DepDest = getScheduleData(I);
assert(DepDest && "must be in schedule window");
DepDest->ControlDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
}
}
// Handle the memory dependencies (if any).
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (!DepDest)
@ -8180,10 +8197,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
// WARNING: This required for correctness in the following case:
// * We must prevent reordering of allocas with stacksave intrinsic calls.
// We rely on two instructions which are both ready (per the subgraph) not
// to be reordered.
// WARNING: If changing this order causes a correctness issue, that means
// there is some missing dependence edge in the schedule data graph.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;

View File

@ -89,22 +89,22 @@ define void @allocas_speculation(i8** %a, i8** %b, i8** %c) {
ret void
}
; FIXME: This is wrong because we lift the alloca out of the region
; We must be careful not to lift the inalloca alloc above the stacksave here.
; We used to miscompile this example before adding explicit dependency handling
; for stacksave.
define void @stacksave(i8** %a, i8** %b, i8** %c) {
; CHECK-LABEL: @stacksave(
; CHECK-NEXT: [[V1:%.*]] = alloca i8, align 1
; CHECK-NEXT: [[V2:%.*]] = alloca inalloca i8, align 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8*> [[TMP3]], i32 0
; CHECK-NEXT: store i8* [[TMP4]], i8** [[A:%.*]], align 8
; CHECK-NEXT: [[ADD1:%.*]] = getelementptr i8, i8* [[V1]], i32 1
; CHECK-NEXT: store i8* [[ADD1]], i8** [[A:%.*]], align 8
; CHECK-NEXT: [[STACK:%.*]] = call i8* @llvm.stacksave()
; CHECK-NEXT: [[V2:%.*]] = alloca inalloca i8, align 1
; CHECK-NEXT: call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: call void @llvm.stackrestore(i8* [[STACK]])
; CHECK-NEXT: [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
; CHECK-NEXT: store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP5]], align 8
; CHECK-NEXT: [[ADD2:%.*]] = getelementptr i8, i8* [[V2]], i32 1
; CHECK-NEXT: store i8* [[ADD1]], i8** [[B:%.*]], align 8
; CHECK-NEXT: [[B2:%.*]] = getelementptr i8*, i8** [[B]], i32 1
; CHECK-NEXT: store i8* [[ADD2]], i8** [[B2]], align 8
; CHECK-NEXT: ret void
;
@ -295,15 +295,13 @@ define void @ham() #1 {
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8** [[VAR32]] to <4 x i8*>*
; CHECK-NEXT: store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP2]], align 8
; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4
; CHECK-NEXT: store i8* [[VAR4]], i8** [[VAR36]], align 8
; CHECK-NEXT: [[VAR37:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 5
; CHECK-NEXT: [[VAR38:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 6
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8*> poison, i8* [[VAR5]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8*> [[TMP3]], i8* [[VAR5]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[VAR37]] to <2 x i8*>*
; CHECK-NEXT: store <2 x i8*> [[TMP4]], <2 x i8*>* [[TMP5]], align 8
; CHECK-NEXT: [[VAR39:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 7
; CHECK-NEXT: store i8* [[VAR5]], i8** [[VAR39]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8*> [[TMP1]], i8* [[VAR5]], i32 1
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i8*> [[TMP3]], <4 x i8*> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>*
; CHECK-NEXT: store <4 x i8*> [[SHUFFLE1]], <4 x i8*>* [[TMP4]], align 8
; CHECK-NEXT: ret void
;
%var2 = alloca i8
@ -343,15 +341,14 @@ define void @spam() #1 {
; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1
; CHECK-NEXT: [[VAR12:%.*]] = alloca [12 x i8*], align 8
; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4
; CHECK-NEXT: store i8* [[VAR4]], i8** [[VAR36]], align 8
; CHECK-NEXT: [[VAR37:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 5
; CHECK-NEXT: [[VAR38:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 6
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[VAR5]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[VAR5]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8** [[VAR37]] to <2 x i8*>*
; CHECK-NEXT: store <2 x i8*> [[TMP2]], <2 x i8*>* [[TMP3]], align 8
; CHECK-NEXT: [[VAR39:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 7
; CHECK-NEXT: store i8* [[VAR5]], i8** [[VAR39]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8*> poison, i8* [[VAR4]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8*> [[TMP1]], i8* [[VAR5]], i32 1
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8*> [[TMP2]], <4 x i8*> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>*
; CHECK-NEXT: store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP3]], align 8
; CHECK-NEXT: ret void
;
%var4 = alloca i8