[OpaquePtr][LoopAccessAnalysis] Support opaque pointers

Previously we relied on the pointee type to determine what type we need
to do runtime pointer access checks.

With opaque pointers, we can access a pointer with more than one type,
so now we keep track of all the types we're accessing a pointer's
memory with.

Also some other minor getPointerElementType() removals.

Reviewed By: #opaque-pointers, nikic

Differential Revision: https://reviews.llvm.org/D119047
This commit is contained in:
Arthur Eubanks 2022-02-07 18:16:08 -08:00
parent 022baf71ed
commit ff31020ee6
3 changed files with 244 additions and 54 deletions

View File

@ -406,8 +406,8 @@ public:
/// according to the assumptions that we've made during the analysis.
/// The method might also version the pointer stride according to \p Strides,
/// and add new predicates to \p PSE.
void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
unsigned ASId, const ValueToValueMap &Strides,
void insert(Loop *Lp, Value *Ptr, Type *AccessTy, bool WritePtr,
unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides,
PredicatedScalarEvolution &PSE);
/// No run-time memory checking is necessary.

View File

@ -189,8 +189,9 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
///
/// There is no conflict when the intervals are disjoint:
/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
unsigned DepSetId, unsigned ASId,
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, Type *AccessTy,
bool WritePtr, unsigned DepSetId,
unsigned ASId,
const ValueToValueMap &Strides,
PredicatedScalarEvolution &PSE) {
// Get the stride replaced scev.
@ -227,8 +228,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
// Add the size of the pointed element to ScEnd.
auto &DL = Lp->getHeader()->getModule()->getDataLayout();
Type *IdxTy = DL.getIndexType(Ptr->getType());
const SCEV *EltSizeSCEV =
SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType());
const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
@ -522,19 +522,19 @@ public:
: TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {}
/// Register a load and whether it is only read from.
void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, false));
Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
if (IsReadOnly)
ReadOnlyPtr.insert(Ptr);
}
/// Register a store.
void addStore(MemoryLocation &Loc) {
void addStore(MemoryLocation &Loc, Type *AccessTy) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, true));
Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
}
/// Check if we can emit a run-time no-alias check for \p Access.
@ -545,12 +545,11 @@ public:
/// we will attempt to use additional run-time checks in order to get
/// the bounds of the pointer.
bool createCheckForAccess(RuntimePointerChecking &RtCheck,
MemAccessInfo Access,
MemAccessInfo Access, Type *AccessTy,
const ValueToValueMap &Strides,
DenseMap<Value *, unsigned> &DepSetId,
Loop *TheLoop, unsigned &RunningDepId,
unsigned ASId, bool ShouldCheckStride,
bool Assume);
unsigned ASId, bool ShouldCheckStride, bool Assume);
/// Check whether we can check the pointers at runtime for
/// non-intersection.
@ -583,14 +582,15 @@ public:
MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
private:
typedef SetVector<MemAccessInfo> PtrAccessSet;
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
/// Go over all memory access and check whether runtime pointer checks
/// are needed and build sets of dependency check candidates.
void processMemAccesses();
/// Set of all accesses.
PtrAccessSet Accesses;
/// Map of all accesses. Values are the types used to access memory pointed to
/// by the pointer.
PtrAccessMap Accesses;
/// The loop being checked.
const Loop *TheLoop;
@ -652,12 +652,12 @@ static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy,
Loop *L) {
const SCEV *PtrScev = PSE.getSCEV(Ptr);
if (PSE.getSE()->isLoopInvariant(PtrScev, L))
return true;
Type *AccessTy = Ptr->getType()->getPointerElementType();
int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides);
if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
return true;
@ -689,7 +689,7 @@ static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
}
bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
MemAccessInfo Access,
MemAccessInfo Access, Type *AccessTy,
const ValueToValueMap &StridesMap,
DenseMap<Value *, unsigned> &DepSetId,
Loop *TheLoop, unsigned &RunningDepId,
@ -702,7 +702,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
// When we run after a failing dependency check we have to make sure
// we don't have wrapping pointers.
if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
auto *Expr = PSE.getSCEV(Ptr);
if (!Assume || !isa<SCEVAddRecExpr>(Expr))
return false;
@ -723,11 +723,11 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
DepId = RunningDepId++;
bool IsWrite = Access.getInt();
RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
RtCheck.insert(TheLoop, Ptr, AccessTy, IsWrite, DepId, ASId, StridesMap, PSE);
LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
return true;
}
}
bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
ScalarEvolution *SE, Loop *TheLoop,
@ -788,12 +788,15 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
}
for (auto &Access : AccessInfos) {
if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
RunningDepId, ASId, ShouldCheckWrap, false)) {
LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
<< *Access.getPointer() << '\n');
Retries.push_back(Access);
CanDoAliasSetRT = false;
for (auto &AccessTy : Accesses[Access]) {
if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
DepSetId, TheLoop, RunningDepId, ASId,
ShouldCheckWrap, false)) {
LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
<< *Access.getPointer() << '\n');
Retries.push_back(Access);
CanDoAliasSetRT = false;
}
}
}
@ -815,13 +818,16 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// We know that we need these checks, so we can now be more aggressive
// and add further checks if required (overflow checks).
CanDoAliasSetRT = true;
for (auto Access : Retries)
if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
TheLoop, RunningDepId, ASId,
ShouldCheckWrap, /*Assume=*/true)) {
CanDoAliasSetRT = false;
break;
for (auto Access : Retries) {
for (auto &AccessTy : Accesses[Access]) {
if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
DepSetId, TheLoop, RunningDepId, ASId,
ShouldCheckWrap, /*Assume=*/true)) {
CanDoAliasSetRT = false;
break;
}
}
}
}
CanDoRT &= CanDoAliasSetRT;
@ -886,9 +892,12 @@ void AccessAnalysis::processMemAccesses() {
LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
LLVM_DEBUG({
for (auto A : Accesses)
dbgs() << "\t" << *A.getPointer() << " (" <<
(A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
"read-only" : "read")) << ")\n";
dbgs() << "\t" << *A.first.getPointer() << " ("
<< (A.first.getInt()
? "write"
: (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
: "read"))
<< ")\n";
});
// The AliasSetTracker has nicely partitioned our pointers by metadata
@ -907,13 +916,13 @@ void AccessAnalysis::processMemAccesses() {
UnderlyingObjToAccessMap ObjToLastAccess;
// Set of access to check after all writes have been processed.
PtrAccessSet DeferredAccesses;
PtrAccessMap DeferredAccesses;
// Iterate over each alias set twice, once to process read/write pointers,
// and then to process read-only pointers.
for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
bool UseDeferred = SetIteration > 0;
PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
for (const auto &AV : AS) {
Value *Ptr = AV.getValue();
@ -921,10 +930,10 @@ void AccessAnalysis::processMemAccesses() {
// For a single memory access in AliasSetTracker, Accesses may contain
// both read and write, and they both need to be handled for CheckDeps.
for (const auto &AC : S) {
if (AC.getPointer() != Ptr)
if (AC.first.getPointer() != Ptr)
continue;
bool IsWrite = AC.getInt();
bool IsWrite = AC.first.getInt();
// If we're using the deferred access set, then it contains only
// reads.
@ -946,7 +955,9 @@ void AccessAnalysis::processMemAccesses() {
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
if (!UseDeferred && IsReadOnlyPtr) {
DeferredAccesses.insert(Access);
// We only use the pointer keys, the types vector values don't
// matter.
DeferredAccesses.insert({Access, {}});
continue;
}
@ -1518,8 +1529,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
Value *BPtr = B.getPointer();
bool AIsWrite = A.getInt();
bool BIsWrite = B.getInt();
Type *ATy = APtr->getType()->getPointerElementType();
Type *BTy = BPtr->getType()->getPointerElementType();
Type *ATy = getLoadStoreType(InstMap[AIdx]);
Type *BTy = getLoadStoreType(InstMap[BIdx]);
// Two reads are independent.
if (!AIsWrite && !BIsWrite)
@ -1842,8 +1853,6 @@ bool LoopAccessInfo::canAnalyzeLoop() {
void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
const TargetLibraryInfo *TLI,
DominatorTree *DT) {
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store instructions.
SmallVector<LoadInst *, 16> Loads;
SmallVector<StoreInst *, 16> Stores;
@ -1975,11 +1984,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// for read and once for write, it will only appear once (on the write
// list). This is okay, since we are going to check for conflicts between
// writes and between reads and writes, but not between reads and reads.
ValueSet Seen;
SmallSet<std::pair<Value *, Type *>, 16> Seen;
// Record uniform store addresses to identify if we have multiple stores
// to the same address.
ValueSet UniformStores;
SmallPtrSet<Value *, 16> UniformStores;
for (StoreInst *ST : Stores) {
Value *Ptr = ST->getPointerOperand();
@ -1990,7 +1999,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr).second) {
Type *AccessTy = getLoadStoreType(ST);
if (Seen.insert({Ptr, AccessTy}).second) {
++NumReadWrites;
MemoryLocation Loc = MemoryLocation::get(ST);
@ -2001,9 +2011,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
Loc.AATags.TBAA = nullptr;
visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
[&Accesses, Loc](Value *Ptr) {
[&Accesses, AccessTy, Loc](Value *Ptr) {
MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
Accesses.addStore(NewLoc);
Accesses.addStore(NewLoc, AccessTy);
});
}
}
@ -2027,7 +2037,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
bool IsReadOnlyPtr = false;
if (Seen.insert(Ptr).second ||
Type *AccessTy = getLoadStoreType(LD);
if (Seen.insert({Ptr, AccessTy}).second ||
!getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides)) {
++NumReads;
IsReadOnlyPtr = true;
@ -2049,9 +2060,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
Loc.AATags.TBAA = nullptr;
visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
[&Accesses, Loc, IsReadOnlyPtr](Value *Ptr) {
[&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
Accesses.addLoad(NewLoc, IsReadOnlyPtr);
Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
});
}

View File

@ -0,0 +1,179 @@
; RUN: opt -S -disable-output --opaque-pointers -passes='print-access-info' < %s 2>&1 | FileCheck %s
; In the function below some of the accesses are done as float types and some
; are done as i32 types. When doing dependence analysis the type should not
; matter if it can be determined that they are the same size.
%int_pair = type { i32, i32 }
; CHECK-LABEL: function 'backdep_type_size_equivalence':
; CHECK-NEXT: loop:
; CHECK-NEXT: Memory dependences are safe with a maximum dependence distance of 800 bytes
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %ld.f32 = load float, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %indvars.iv.i32, ptr %gep.iv, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %ld.f32 = load float, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store float %val, ptr %gep.iv.min.100, align 8
; CHECK-EMPTY:
; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: store float %val, ptr %gep.iv.min.100, align 8 ->
; CHECK-NEXT: store i32 %indvars.iv.i32, ptr %gep.iv, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
;; Exercises accesses through a single pointer using two element types of the
;; same store size (float and i32, both 4 bytes). With opaque pointers the
;; access type comes from the load/store instruction, not the pointee type,
;; and same-size accesses should still be analyzable as safe dependences.
define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
entry:
br label %loop
loop:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
;; Load from vec[indvars.iv].x as float
%gep.iv = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 0
%ld.f32 = load float, ptr %gep.iv, align 8
%val = fmul fast float %ld.f32, 5.0
;; Store to vec[indvars.iv - 100].x as float
%indvars.iv.min.100 = add nsw i64 %indvars.iv, -100
%gep.iv.min.100 = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv.min.100, i32 0
store float %val, ptr %gep.iv.min.100, align 8
;; Store to vec[indvars.iv].x as i32, creating a backward dependency between
;; the two stores with different element types but the same element size.
%indvars.iv.i32 = trunc i64 %indvars.iv to i32
store i32 %indvars.iv.i32, ptr %gep.iv, align 8
;; Store to vec[indvars.iv].y as i32, strided accesses should be independent
;; between the two stores with different element types but the same element size.
%gep.iv.1 = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 1
store i32 %indvars.iv.i32, ptr %gep.iv.1, align 8
;; Store to vec[indvars.iv + n].y as i32, to verify no dependence in the case
;; of unknown dependence distance.
%indvars.iv.n = add nuw nsw i64 %indvars.iv, %n
%gep.iv.n = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv.n, i32 1
store i32 %indvars.iv.i32, ptr %gep.iv.n, align 8
;; Loop condition.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%cond = icmp eq i64 %indvars.iv.next, %n
br i1 %cond, label %exit, label %loop
exit:
ret void
}
; In the function below one of the accesses is done as i19 type, which has a
; different store size than the i32 type, even though their alloc sizes are
; equivalent. This is a negative test to ensure that they are not analyzed as
; in the tests above.
;
; CHECK-LABEL: function 'backdep_type_store_size_equivalence':
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %ld.f32 = load float, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i19 %indvars.iv.i19, ptr %gep.iv, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
;; Negative test: the i19 store has a different store size (3 bytes) than the
;; float load (4 bytes) at the same address, even though their alloc sizes
;; both round up to 4 bytes, so the dependence must stay "unknown".
define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
entry:
br label %loop
loop:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
;; Load from vec[indvars.iv].x as float
%gep.iv = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 0
%ld.f32 = load float, ptr %gep.iv, align 8
;; NOTE(review): %val is not used further; presumably it only exists to keep
;; the load live for the analysis — confirm against the original test intent.
%val = fmul fast float %ld.f32, 5.0
;; Store to vec[indvars.iv].x as i19.
%indvars.iv.i19 = trunc i64 %indvars.iv to i19
store i19 %indvars.iv.i19, ptr %gep.iv, align 8
;; Loop condition.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%cond = icmp eq i64 %indvars.iv.next, %n
br i1 %cond, label %exit, label %loop
exit:
ret void
}
; In the function below some of the accesses are done as double types and some
; are done as i64 and i32 types. This is a negative test to ensure that they
; are not analyzed as in the tests above.
; CHECK-LABEL: function 'neg_dist_dep_type_size_equivalence':
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
; CHECK-EMPTY:
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store double %val, ptr %gep.iv.101, align 8
; CHECK-EMPTY:
; CHECK-NEXT: ForwardButPreventsForwarding:
; CHECK-NEXT: store double %val, ptr %gep.iv.101, align 8 ->
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: store double %val, ptr %gep.iv.101, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
;; Negative test mixing double/i64 (8-byte) accesses with an i32 (4-byte)
;; store at an unknown distance (%n): the differing access sizes prevent the
;; analysis from proving independence, so the loop is reported unsafe.
define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
entry:
br label %loop
loop:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
;; Load from vec[indvars.iv] as double
%gep.iv = getelementptr i64, ptr %vec, i64 %indvars.iv
%ld.f64 = load double, ptr %gep.iv, align 8
%val = fmul fast double %ld.f64, 5.0
;; Store to vec[indvars.iv + 101] as double
%indvars.iv.101 = add nsw i64 %indvars.iv, 101
%gep.iv.101 = getelementptr i64, ptr %vec, i64 %indvars.iv.101
store double %val, ptr %gep.iv.101, align 8
;; Read from vec[indvars.iv] as i64 creating
;; a forward but prevents forwarding dependence
;; with different types but same sizes.
%ld.i64 = load i64, ptr %gep.iv, align 8
;; Different sizes: the i32 store below is 4 bytes against the 8-byte
;; accesses above, at an offset of %n elements (unknown at compile time).
%indvars.iv.n = add nuw nsw i64 %indvars.iv, %n
%gep.iv.n = getelementptr inbounds i64, ptr %vec, i64 %indvars.iv.n
%ld.i64.i32 = trunc i64 %ld.i64 to i32
store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
;; Loop condition.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%cond = icmp eq i64 %indvars.iv.next, %n
br i1 %cond, label %exit, label %loop
exit:
ret void
}