[MemorySSA] Support invariant.group metadata

The implementation is mostly copied from MemDepAnalysis. We look at all loads
and stores of the same pointer operand, treating bitcasts and zero GEPs of a
pointer as the same pointer value, and choose the most dominating such
instruction.
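
As a minimal illustration (this mirrors the IR in the new unittest), the walker
now reports the store as the clobber of the load, even though the call in
between may write memory:

  declare void @f(i8*)

  define i8 @test(i8* %p) {
    store i8 42, i8* %p, !invariant.group !0
    call void @f(i8* %p)                       ; clobbers memory in general
    %v = load i8, i8* %p, !invariant.group !0  ; clobbering access: the store
    ret i8 %v
  }

  !0 = !{}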

Since updating MemorySSA in the presence of invariant.group is non-trivial,
invariant.group handling is not cached in any way for now and lives entirely in
the walker. The number of loads/stores carrying invariant.group is small for
now anyway. We can revisit this if it noticeably affects compile times.

To keep invariant.group from affecting optimized uses, optimizeUsesInBlock()
must not use invariant.group in any way.
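
Concretely, regular walker queries keep going through the invariant.group-aware
path, while the use optimizer calls a new entry point that bypasses it. A
sketch of the two call sites (names as in the diff below):

  // Regular clobber queries may take the invariant.group shortcut.
  MemoryAccess *Clobber =
      Walker->getClobberingMemoryAccess(MU, UpwardWalkLimit);

  // OptimizeUses::optimizeUsesInBlock() must not, because the ad-hoc
  // invariant.group handling does not support MemorySSA updates.
  MemoryAccess *Result =
      Walker->getClobberingMemoryAccessWithoutInvariantGroup(MU, UpwardWalkLimit);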

Co-authored-by: Piotr Padlewski <prazek@google.com>

Reviewed By: asbirlea, nikic, Prazek

Differential Revision: https://reviews.llvm.org/D109134
Author: Arthur Eubanks
Date: 2021-08-31 12:24:50 -07:00
Commit: b493124ae2 (parent ff7a332e6f)
4 changed files with 265 additions and 84 deletions


@@ -1039,7 +1039,8 @@ public:
// updated if a new clobber is found by this SkipSelf search. If this
// additional query becomes heavily used we may decide to cache the result.
// Walker instantiations will decide how to set the SkipSelf bool.
MemoryAccess *getClobberingMemoryAccessBase(MemoryAccess *, unsigned &, bool);
MemoryAccess *getClobberingMemoryAccessBase(MemoryAccess *, unsigned &, bool,
bool UseInvariantGroup = true);
};
/// A MemorySSAWalker that does AA walks to disambiguate accesses. It no
@@ -1064,6 +1065,11 @@ public:
unsigned &UWL) {
return Walker->getClobberingMemoryAccessBase(MA, Loc, UWL);
}
// This method is not accessible outside of this file.
MemoryAccess *getClobberingMemoryAccessWithoutInvariantGroup(MemoryAccess *MA,
unsigned &UWL) {
return Walker->getClobberingMemoryAccessBase(MA, UWL, false, false);
}
MemoryAccess *getClobberingMemoryAccess(MemoryAccess *MA) override {
unsigned UpwardWalkLimit = MaxCheckLimit;
@@ -1460,10 +1466,13 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
unsigned UpwardWalkLimit = MaxCheckLimit;
while (UpperBound > LocInfo.LowerBound) {
if (isa<MemoryPhi>(VersionStack[UpperBound])) {
// For phis, use the walker, see where we ended up, go there
// For phis, use the walker, see where we ended up, go there.
// The invariant.group handling in MemorySSA is ad-hoc and doesn't
// support updates, so don't use it to optimize uses.
MemoryAccess *Result =
Walker->getClobberingMemoryAccess(MU, UpwardWalkLimit);
// We are guaranteed to find it or something is wrong
Walker->getClobberingMemoryAccessWithoutInvariantGroup(
MU, UpwardWalkLimit);
// We are guaranteed to find it or something is wrong.
while (VersionStack[UpperBound] != Result) {
assert(UpperBound != 0);
--UpperBound;
@@ -2469,15 +2478,88 @@ MemorySSA::ClobberWalkerBase<AliasAnalysisType>::getClobberingMemoryAccessBase(
return Clobber;
}
static const Instruction *
getInvariantGroupClobberingInstruction(Instruction &I, DominatorTree &DT) {
if (!I.hasMetadata(LLVMContext::MD_invariant_group) || I.isVolatile())
return nullptr;
// We consider bitcasts and zero GEPs to be the same pointer value. Start by
// stripping bitcasts and zero GEPs, then we will recursively look at loads
// and stores through bitcasts and zero GEPs.
Value *PointerOperand = getLoadStorePointerOperand(&I)->stripPointerCasts();
// It's not safe to walk the use list of a global value because function
// passes aren't allowed to look outside their functions.
// FIXME: this could be fixed by filtering instructions from outside of
// current function.
if (isa<Constant>(PointerOperand))
return nullptr;
// Queue to process all pointers that are equivalent to load operand.
SmallVector<const Value *, 8> PointerUsesQueue;
PointerUsesQueue.push_back(PointerOperand);
const Instruction *MostDominatingInstruction = &I;
// FIXME: This loop is O(n^2) because dominates can be O(n) and in worst case
// we will see all the instructions. It may not matter in practice. If it
// does, we will have to support MemorySSA construction and updates.
while (!PointerUsesQueue.empty()) {
const Value *Ptr = PointerUsesQueue.pop_back_val();
assert(Ptr && !isa<GlobalValue>(Ptr) &&
"Null or GlobalValue should not be inserted");
for (const User *Us : Ptr->users()) {
auto *U = dyn_cast<Instruction>(Us);
if (!U || U == &I || !DT.dominates(U, MostDominatingInstruction))
continue;
// Add bitcasts and zero GEPs to queue.
if (isa<BitCastInst>(U)) {
PointerUsesQueue.push_back(U);
continue;
}
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
if (GEP->hasAllZeroIndices())
PointerUsesQueue.push_back(U);
continue;
}
// If we hit a load/store with an invariant.group metadata and the same
// pointer operand, we can assume that value pointed to by the pointer
// operand didn't change.
if (U->hasMetadata(LLVMContext::MD_invariant_group) &&
getLoadStorePointerOperand(U) == Ptr && !U->isVolatile()) {
MostDominatingInstruction = U;
}
}
}
return MostDominatingInstruction == &I ? nullptr : MostDominatingInstruction;
}
template <typename AliasAnalysisType>
MemoryAccess *
MemorySSA::ClobberWalkerBase<AliasAnalysisType>::getClobberingMemoryAccessBase(
MemoryAccess *MA, unsigned &UpwardWalkLimit, bool SkipSelf) {
MemoryAccess *MA, unsigned &UpwardWalkLimit, bool SkipSelf,
bool UseInvariantGroup) {
auto *StartingAccess = dyn_cast<MemoryUseOrDef>(MA);
// If this is a MemoryPhi, we can't do anything.
if (!StartingAccess)
return MA;
if (UseInvariantGroup) {
if (auto *I = getInvariantGroupClobberingInstruction(
*StartingAccess->getMemoryInst(), MSSA->getDomTree())) {
assert(isa<LoadInst>(I) || isa<StoreInst>(I));
auto *ClobberMA = MSSA->getMemoryAccess(I);
assert(ClobberMA);
if (isa<MemoryUse>(ClobberMA))
return ClobberMA->getDefiningAccess();
return ClobberMA;
}
}
bool IsOptimized = false;
// If this is an already optimized use or def, return the optimized result.


@@ -1,11 +1,44 @@
; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa-walker>' -verify-memoryssa < %s 2>&1 | FileCheck %s
;
; Currently, MemorySSA doesn't support invariant groups. So, we should ignore
; launder.invariant.group intrinsics entirely. We'll need to pay attention to
; them when/if we decide to support invariant groups.
@g = external global i32
; CHECK-LABEL: define {{.*}} @global(
define i32 @global() {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
store i32 0, i32* @g, align 4, !invariant.group !0
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: call void @clobber
call void @clobber(i32* @g)
; FIXME: this could be clobbered by 1 if we walked the instruction list for loads/stores to @g.
; But we can't look at the uses of @g in a function analysis.
; CHECK: MemoryUse(2) {{.*}} clobbered by 2
; CHECK-NEXT: %1 = load i32
%1 = load i32, i32* @g, align 4, !invariant.group !0
ret i32 %1
}
; CHECK-LABEL: define {{.*}} @global2(
define i32 @global2() {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
store i32 0, i32* inttoptr (i64 ptrtoint (i32* @g to i64) to i32*), align 4, !invariant.group !0
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: call void @clobber
call void @clobber(i32* inttoptr (i64 ptrtoint (i32* @g to i64) to i32*))
; FIXME: this could be clobbered by 1 if we walked the instruction list for loads/stores to @g.
; But we can't look at the uses of @g in a function analysis.
; CHECK: MemoryUse(2) {{.*}} clobbered by 2
; CHECK-NEXT: %1 = load i32
%1 = load i32, i32* inttoptr (i64 ptrtoint (i32* @g to i64) to i32*), align 4, !invariant.group !0
ret i32 %1
}
; CHECK-LABEL: define {{.*}} @foo(
define i32 @foo(i32* %a) {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
@@ -29,6 +62,41 @@ define i32 @foo(i32* %a) {
ret i32 %2
}
; CHECK-LABEL: define {{.*}} @volatile1(
define void @volatile1(i32* %a) {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
store i32 0, i32* %a, align 4, !invariant.group !0
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: call void @clobber
call void @clobber(i32* %a)
; CHECK: 3 = MemoryDef(2){{.*}} clobbered by 2
; CHECK-NEXT: load volatile
%b = load volatile i32, i32* %a, align 4, !invariant.group !0
ret void
}
; CHECK-LABEL: define {{.*}} @volatile2(
define void @volatile2(i32* %a) {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store volatile i32 0
store volatile i32 0, i32* %a, align 4, !invariant.group !0
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: call void @clobber
call void @clobber(i32* %a)
; CHECK: MemoryUse(2){{.*}} clobbered by 2
; CHECK-NEXT: load i32
%b = load i32, i32* %a, align 4, !invariant.group !0
ret void
}
; CHECK-LABEL: define {{.*}} @skipBarrier(
define i32 @skipBarrier(i32* %a) {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
@@ -47,6 +115,7 @@ define i32 @skipBarrier(i32* %a) {
ret i32 %2
}
; CHECK-LABEL: define {{.*}} @skipBarrier2(
define i32 @skipBarrier2(i32* %a) {
; CHECK: MemoryUse(liveOnEntry)
@@ -67,8 +136,7 @@ define i32 @skipBarrier2(i32* %a) {
; CHECK-NEXT: store i32 1
store i32 1, i32* @g, align 4
; FIXME: based on invariant.group it should be MemoryUse(liveOnEntry)
; CHECK: MemoryUse(2) {{.*}} clobbered by 2
; CHECK: MemoryUse(2) {{.*}} clobbered by liveOnEntry
; CHECK-NEXT: %v3 = load i32
%v3 = load i32, i32* %a32, align 4, !invariant.group !0
%add = add nsw i32 %v2, %v3
@@ -76,6 +144,7 @@ define i32 @skipBarrier2(i32* %a) {
ret i32 %add2
}
; CHECK-LABEL: define {{.*}} @handleInvariantGroups(
define i32 @handleInvariantGroups(i32* %a) {
; CHECK: 1 = MemoryDef(liveOnEntry)
; CHECK-NEXT: store i32 0
@@ -98,14 +167,14 @@ define i32 @handleInvariantGroups(i32* %a) {
; CHECK-NEXT: store i32 2
store i32 2, i32* @g, align 4
; FIXME: This can be changed to MemoryUse(2)
; CHECK: MemoryUse(4) {{.*}} clobbered by 4
; CHECK: MemoryUse(4) {{.*}} clobbered by 2
; CHECK-NEXT: %3 = load i32
%3 = load i32, i32* %a32, align 4, !invariant.group !0
%add = add nsw i32 %2, %3
ret i32 %add
}
; CHECK-LABEL: define {{.*}} @loop(
define i32 @loop(i1 %a) {
entry:
%0 = alloca i32, align 4
@@ -118,15 +187,13 @@ entry:
br i1 %a, label %Loop.Body, label %Loop.End
Loop.Body:
; FIXME: MemoryUse(1)
; CHECK: MemoryUse(2) {{.*}} clobbered by 2
; CHECK: MemoryUse(2) {{.*}} clobbered by 1
; CHECK-NEXT: %1 = load i32
%1 = load i32, i32* %0, !invariant.group !0
br i1 %a, label %Loop.End, label %Loop.Body
Loop.End:
; FIXME: MemoryUse(1)
; CHECK: MemoryUse(2) {{.*}} clobbered by 2
; CHECK: MemoryUse(2) {{.*}} clobbered by 1
; CHECK-NEXT: %2 = load
%2 = load i32, i32* %0, align 4, !invariant.group !0
br i1 %a, label %Ret, label %Loop.Body
@@ -135,6 +202,7 @@ Ret:
ret i32 %2
}
; CHECK-LABEL: define {{.*}} @loop2(
define i8 @loop2(i8* %p) {
entry:
; CHECK: 1 = MemoryDef(liveOnEntry)
@@ -154,8 +222,7 @@ Loop.Body:
; CHECK-NEXT: %0 = load i8
%0 = load i8, i8* %after, !invariant.group !0
; FIXME: MemoryUse(1)
; CHECK: MemoryUse(6) {{.*}} clobbered by 6
; CHECK: MemoryUse(6) {{.*}} clobbered by 1
; CHECK-NEXT: %1 = load i8
%1 = load i8, i8* %p, !invariant.group !0
@@ -169,8 +236,7 @@ Loop.End:
; CHECK-NEXT: %2 = load
%2 = load i8, i8* %after, align 4, !invariant.group !0
; FIXME: MemoryUse(1)
; CHECK: MemoryUse(5) {{.*}} clobbered by 5
; CHECK: MemoryUse(5) {{.*}} clobbered by 1
; CHECK-NEXT: %3 = load
%3 = load i8, i8* %p, align 4, !invariant.group !0
br i1 undef, label %Ret, label %Loop.Body
@@ -180,6 +246,7 @@ Ret:
}
; CHECK-LABEL: define {{.*}} @loop3(
define i8 @loop3(i8* %p) {
entry:
; CHECK: 1 = MemoryDef(liveOnEntry)
@@ -203,8 +270,7 @@ Loop.Body:
; CHECK-NEXT: call void @clobber8
call void @clobber8(i8* %after)
; FIXME: MemoryUse(8)
; CHECK: MemoryUse(4) {{.*}} clobbered by 4
; CHECK: MemoryUse(4) {{.*}} clobbered by 8
; CHECK-NEXT: %1 = load i8
%1 = load i8, i8* %after, !invariant.group !0
@@ -214,8 +280,7 @@ Loop.next:
; CHECK-NEXT: call void @clobber8
call void @clobber8(i8* %after)
; FIXME: MemoryUse(8)
; CHECK: MemoryUse(5) {{.*}} clobbered by 5
; CHECK: MemoryUse(5) {{.*}} clobbered by 8
; CHECK-NEXT: %2 = load i8
%2 = load i8, i8* %after, !invariant.group !0
@@ -230,8 +295,7 @@ Loop.End:
; CHECK-NEXT: call void @clobber8
call void @clobber8(i8* %after)
; FIXME: MemoryUse(7)
; CHECK: MemoryUse(6) {{.*}} clobbered by 6
; CHECK: MemoryUse(6) {{.*}} clobbered by 7
; CHECK-NEXT: %4 = load
%4 = load i8, i8* %after, align 4, !invariant.group !0
br i1 undef, label %Ret, label %Loop.Body
@@ -240,6 +304,7 @@ Ret:
ret i8 %3
}
; CHECK-LABEL: define {{.*}} @loop4(
define i8 @loop4(i8* %p) {
entry:
; CHECK: 1 = MemoryDef(liveOnEntry)
@@ -263,8 +328,7 @@ Loop.Body:
; CHECK-NEXT: %1 = load i8
%1 = load i8, i8* %after, !invariant.group !0
; FIXME: MemoryUse(2)
; CHECK: MemoryUse(6) {{.*}} clobbered by 6
; CHECK: MemoryUse(6) {{.*}} clobbered by 1
; CHECK-NEXT: %2 = load i8
%2 = load i8, i8* %p, !invariant.group !0
@@ -277,8 +341,7 @@ Loop.End:
; CHECK-NEXT: %3 = load
%3 = load i8, i8* %after, align 4, !invariant.group !0
; FIXME: MemoryUse(2)
; CHECK: MemoryUse(5) {{.*}} clobbered by 5
; CHECK: MemoryUse(5) {{.*}} clobbered by 1
; CHECK-NEXT: %4 = load
%4 = load i8, i8* %p, align 4, !invariant.group !0
br i1 undef, label %Ret, label %Loop.Body
@@ -288,7 +351,7 @@ Ret:
}
; In the future we would like to CSE barriers if there is no clobber between.
; CHECK-LABEL: define i8 @optimizable()
; CHECK-LABEL: define {{.*}} @optimizable(
define i8 @optimizable() {
entry:
%ptr = alloca i8
@@ -318,7 +381,7 @@ entry:
ret i8 %v
}
; CHECK-LABEL: define i8 @unoptimizable2()
; CHECK-LABEL: define {{.*}} @unoptimizable2()
define i8 @unoptimizable2() {
%ptr = alloca i8
; CHECK: 1 = MemoryDef(liveOnEntry)


@@ -1,5 +1,4 @@
; XFAIL: *
; RUN: opt < %s -newgvn -S | FileCheck %s
; RUN: opt < %s -passes=newgvn -S | FileCheck %s
%struct.A = type { i32 (...)** }
@_ZTV1A = available_externally unnamed_addr constant [3 x i8*] [i8* null, i8* bitcast (i8** @_ZTI1A to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3fooEv to i8*)], align 8
@@ -54,9 +53,6 @@ entry:
; CHECK-LABEL: define i1 @proveEqualityForStrip(
define i1 @proveEqualityForStrip(i8* %a) {
; FIXME: The first call could be also removed by GVN. Right now
; DCE removes it. The second call is CSE'd with the first one.
; CHECK: %b1 = call i8* @llvm.strip.invariant.group.p0i8(i8* %a)
%b1 = call i8* @llvm.strip.invariant.group.p0i8(i8* %a)
; CHECK-NOT: llvm.strip.invariant.group
%b2 = call i8* @llvm.strip.invariant.group.p0i8(i8* %a)
@@ -76,6 +72,7 @@ entry:
ret i8 %a
}
; NewGVN doesn't support assumes.
; CHECK-LABEL: define void @indirectLoads() {
define void @indirectLoads() {
entry:
@@ -96,7 +93,7 @@ entry:
%3 = load %struct.A*, %struct.A** %a, align 8
%4 = bitcast %struct.A* %3 to void (%struct.A*)***
; CHECK: call void @_ZN1A3fooEv(
; FIXME: call void @_ZN1A3fooEv(
%vtable1 = load void (%struct.A*)**, void (%struct.A*)*** %4, align 8, !invariant.group !0
%vfn = getelementptr inbounds void (%struct.A*)*, void (%struct.A*)** %vtable1, i64 0
%5 = load void (%struct.A*)*, void (%struct.A*)** %vfn, align 8
@@ -104,7 +101,7 @@ entry:
%6 = load %struct.A*, %struct.A** %a, align 8
%7 = bitcast %struct.A* %6 to void (%struct.A*)***
; CHECK: call void @_ZN1A3fooEv(
; FIXME: call void @_ZN1A3fooEv(
%vtable2 = load void (%struct.A*)**, void (%struct.A*)*** %7, align 8, !invariant.group !0
%vfn3 = getelementptr inbounds void (%struct.A*)*, void (%struct.A*)** %vtable2, i64 0
%8 = load void (%struct.A*)*, void (%struct.A*)** %vfn3, align 8
@@ -116,19 +113,20 @@ entry:
%vtable4 = load void (%struct.A*)**, void (%struct.A*)*** %10, align 8, !invariant.group !0
%vfn5 = getelementptr inbounds void (%struct.A*)*, void (%struct.A*)** %vtable4, i64 0
%11 = load void (%struct.A*)*, void (%struct.A*)** %vfn5, align 8
; CHECK: call void @_ZN1A3fooEv(
; FIXME: call void @_ZN1A3fooEv(
call void %11(%struct.A* %9)
%vtable5 = load i8**, i8*** %2, align 8, !invariant.group !0
%vfn6 = getelementptr inbounds i8*, i8** %vtable5, i64 0
%12 = bitcast i8** %vfn6 to void (%struct.A*)**
%13 = load void (%struct.A*)*, void (%struct.A*)** %12, align 8
; CHECK: call void @_ZN1A3fooEv(
; FIXME: call void @_ZN1A3fooEv(
call void %13(%struct.A* %9)
ret void
}
; NewGVN won't CSE loads with different pointee types.
; CHECK-LABEL: define void @combiningBitCastWithLoad() {
define void @combiningBitCastWithLoad() {
entry:
@@ -145,7 +143,7 @@ entry:
%cmp.vtables = icmp eq i8** %vtable, getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2)
store %struct.A* %1, %struct.A** %a, align 8
; CHECK-NOT: !invariant.group
; FIXME-NOT: !invariant.group
%3 = load %struct.A*, %struct.A** %a, align 8
%4 = bitcast %struct.A* %3 to void (%struct.A*)***
@@ -163,7 +161,7 @@ enter:
%ptr = alloca i8
store i8 42, i8* %ptr
call void @foo(i8* %ptr)
; CHECK: %[[A:.*]] = load i8, i8* %ptr, !invariant.group
; CHECK: %[[A:.*]] = load i8, i8* %ptr, align 1, !invariant.group
%a = load i8, i8* %ptr, !invariant.group !0
; CHECK-NOT: load
%b = load i8, i8* %ptr, !invariant.group !0
@@ -180,7 +178,7 @@ enter:
%ptr = alloca i8
store i8 42, i8* %ptr
call void @foo(i8* %ptr)
; CHECK: %[[D:.*]] = load i8, i8* %ptr, !invariant.group
; CHECK: %[[D:.*]] = load i8, i8* %ptr, align 1, !invariant.group
%c = load i8, i8* %ptr
; CHECK-NOT: load
%d = load i8, i8* %ptr, !invariant.group !0
@@ -197,7 +195,7 @@ enter:
%ptr = alloca i8
store i8 42, i8* %ptr
call void @foo(i8* %ptr)
; CHECK: %[[E:.*]] = load i8, i8* %ptr, !invariant.group
; CHECK: %[[E:.*]] = load i8, i8* %ptr, align 1, !invariant.group
%e = load i8, i8* %ptr, !invariant.group !0
; CHECK-NOT: load
%f = load i8, i8* %ptr
@@ -214,7 +212,7 @@ enter:
%ptr = alloca i8
store i8 42, i8* %ptr
call void @foo(i8* %ptr)
; CHECK: %[[E:.*]] = load i8, i8* %ptr, !invariant.group
; CHECK: %[[E:.*]] = load i8, i8* %ptr, align 1, !invariant.group
%e = load i8, i8* %ptr, !invariant.group !0
; CHECK-NOT: load
%f = load i8, i8* %ptr, !invariant.group !0
@@ -251,16 +249,17 @@ entry:
ret i8 %a
}
; NewGVN cares about the launder for some reason.
; CHECK-LABEL: define i8 @optimizable4() {
define i8 @optimizable4() {
entry:
%ptr = alloca i8
store i8 42, i8* %ptr, !invariant.group !0
store i8 42, i8* %ptr
%ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr)
; CHECK-NOT: load
%a = load i8, i8* %ptr2, !invariant.group !0
; FIXME-NOT: load
%a = load i8, i8* %ptr2
; CHECK: ret i8 42
; FIXME: ret i8 42
ret i8 %a
}
@@ -276,7 +275,7 @@ entry:
call void @bar(i8 %b)
%c = load volatile i8, i8* %ptr, !invariant.group !0
; FIXME: we could change %c to 42, preserving volatile load
; We might be able to optimize this, but nobody cares
; CHECK: call void @bar(i8 %c)
call void @bar(i8 %c)
; CHECK: ret i8 42
@@ -295,15 +294,15 @@ entry:
call void @bar(i8 %b)
%c = load volatile i8, i8* %ptr, !invariant.group !0
; FIXME: we could change %c to 42, preserving volatile load
; We might be able to optimize this, but nobody cares
; CHECK: call void @bar(i8 %c)
call void @bar(i8 %c)
; CHECK: ret i8 42
ret i8 %a
}
; CHECK-LABEL: define i8 @fun() {
define i8 @fun() {
; CHECK-LABEL: define void @fun() {
define void @fun() {
entry:
%ptr = alloca i8
store i8 42, i8* %ptr, !invariant.group !0
@@ -313,23 +312,10 @@ entry:
; CHECK: call void @bar(i8 42)
call void @bar(i8 %a)
%newPtr = call i8* @getPointer(i8* %ptr)
%c = load i8, i8* %newPtr, !invariant.group !0 ; Can't assume anything, because we only have information about %ptr
; CHECK: call void @bar(i8 %c)
call void @bar(i8 %c)
%unknownValue = load i8, i8* @unknownPtr
; FIXME: Can assume that %unknownValue == 42
; CHECK: store i8 %unknownValue, i8* %ptr, !invariant.group !0
store i8 %unknownValue, i8* %ptr, !invariant.group !0
%newPtr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr)
; CHECK-NOT: load
%d = load i8, i8* %newPtr2, !invariant.group !0
; CHECK: ret i8 %unknownValue
ret i8 %d
ret void
}
; FIXME: NewGVN doesn't run instsimplify on a load from a vtable definition?
; This test checks if invariant.group understands gep with zeros
; CHECK-LABEL: define void @testGEP0() {
define void @testGEP0() {
@@ -347,7 +333,7 @@ define void @testGEP0() {
%6 = bitcast %struct.A* %a to void (%struct.A*)***
%7 = load void (%struct.A*)**, void (%struct.A*)*** %6, align 8, !invariant.group !0
%8 = load void (%struct.A*)*, void (%struct.A*)** %7, align 8
; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
; FIXME: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
call void %8(%struct.A* nonnull %a)
br label %_Z1gR1A.exit
@@ -360,10 +346,10 @@ _Z1gR1A.exit: ; preds = %0, %5
; from the same function.
; CHECK-LABEL: define void @testGlobal() {
define void @testGlobal() {
; CHECK: %a = load i8, i8* @unknownPtr, !invariant.group !0
; CHECK: %a = load i8, i8* @unknownPtr, align 1, !invariant.group !0
%a = load i8, i8* @unknownPtr, !invariant.group !0
call void @foo2(i8* @unknownPtr, i8 %a)
; CHECK: %1 = load i8, i8* @unknownPtr, !invariant.group !0
; CHECK: %1 = load i8, i8* @unknownPtr, align 1, !invariant.group !0
%1 = load i8, i8* @unknownPtr, !invariant.group !0
call void @bar(i8 %1)
@@ -378,12 +364,14 @@ define void @testGlobal() {
call void @fooBit(i1* %b0, i1 %3)
ret void
}
; And in the case it is not global
; CHECK-LABEL: define void @testNotGlobal() {
define void @testNotGlobal() {
; Might be similar to above where NewGVN doesn't handle loads of different types from the same location.
; Not super important anyway.
; CHECK-LABEL: define void @testTrunc() {
define void @testTrunc() {
%a = alloca i8
call void @foo(i8* %a)
; CHECK: %b = load i8, i8* %a, !invariant.group !0
; CHECK: %b = load i8, i8* %a, align 1, !invariant.group !0
%b = load i8, i8* %a, !invariant.group !0
call void @foo2(i8* %a, i8 %b)
@@ -393,16 +381,17 @@ define void @testNotGlobal() {
%b0 = bitcast i8* %a to i1*
call void @fooBit(i1* %b0, i1 1)
; CHECK: %1 = trunc i8 %b to i1
; FIXME: %1 = trunc i8 %b to i1
%2 = load i1, i1* %b0, !invariant.group !0
; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
; FIXME-NEXT: call void @fooBit(i1* %b0, i1 %1)
call void @fooBit(i1* %b0, i1 %2)
%3 = load i1, i1* %b0, !invariant.group !0
; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
; FIXME-NEXT: call void @fooBit(i1* %b0, i1 %1)
call void @fooBit(i1* %b0, i1 %3)
ret void
}
; See comment in @testGEP0 on what NewGVN is lacking.
; CHECK-LABEL: define void @handling_loops()
define void @handling_loops() {
%a = alloca %struct.A, align 8
@@ -426,9 +415,9 @@ define void @handling_loops() {
%8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
%.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
%9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
; FIXME: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
call void %9(%struct.A* nonnull %a) #3
; CHECK-NOT: call void %
; FIXME-NOT: call void %
%10 = add nuw nsw i8 %8, 1
%11 = load i8, i8* @unknownPtr, align 4
%12 = icmp slt i8 %10, %11
@@ -451,10 +440,11 @@ declare void @_ZN1AC1Ev(%struct.A*)
declare void @fooBit(i1*, i1)
declare i8* @llvm.launder.invariant.group.p0i8(i8*)
declare i8* @llvm.strip.invariant.group.p0i8(i8*)
; Function Attrs: nounwind
declare void @llvm.assume(i1 %cmp.vtables) #0
attributes #0 = { nounwind }
!0 = !{}
!0 = !{}


@@ -1725,4 +1725,50 @@ TEST_F(MemorySSATest, TestLoopInvariantEntryBlockPointer) {
EXPECT_TRUE(ItB->second.Size.getValue() == 8);
}
}
}
}
TEST_F(MemorySSATest, TestInvariantGroup) {
SMDiagnostic E;
auto M = parseAssemblyString("declare void @f(i8*)\n"
"define i8 @test(i8* %p) {\n"
"entry:\n"
" store i8 42, i8* %p, !invariant.group !0\n"
" call void @f(i8* %p)\n"
" %v = load i8, i8* %p, !invariant.group !0\n"
" ret i8 %v\n"
"}\n"
"!0 = !{}",
E, C);
ASSERT_TRUE(M);
F = M->getFunction("test");
ASSERT_TRUE(F);
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
MemorySSAWalker *Walker = Analyses->Walker;
auto &BB = F->getEntryBlock();
auto &SI = cast<StoreInst>(*BB.begin());
auto &Call = cast<CallBase>(*std::next(BB.begin()));
auto &LI = cast<LoadInst>(*std::next(std::next(BB.begin())));
{
MemoryAccess *SAccess = MSSA.getMemoryAccess(&SI);
MemoryAccess *LAccess = MSSA.getMemoryAccess(&LI);
MemoryAccess *SClobber = Walker->getClobberingMemoryAccess(SAccess);
EXPECT_TRUE(MSSA.isLiveOnEntryDef(SClobber));
MemoryAccess *LClobber = Walker->getClobberingMemoryAccess(LAccess);
EXPECT_EQ(SAccess, LClobber);
}
// remove store and verify that the memory accesses still make sense
MemorySSAUpdater Updater(&MSSA);
Updater.removeMemoryAccess(&SI);
SI.eraseFromParent();
{
MemoryAccess *CallAccess = MSSA.getMemoryAccess(&Call);
MemoryAccess *LAccess = MSSA.getMemoryAccess(&LI);
MemoryAccess *LClobber = Walker->getClobberingMemoryAccess(LAccess);
EXPECT_EQ(CallAccess, LClobber);
}
}