From f29645e7afdbb8d1fc2dd603c0b128bac055625c Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 1 Oct 2020 19:17:21 -0700
Subject: [PATCH] [gvn] Handle a corner case w/vectors of non-integral pointers

If we try to coerce a vector of non-integral pointers to a narrower type (either narrower vector or single pointer), we use inttoptr and violate the semantics of non-integral pointers.  In theory, we can handle many of these cases, we just need to use a different code idiom to convert without going through inttoptr and back.

This shows up as wrong code bugs, and in some cases, crashes due to failed asserts.  Modeled after a change which has lived downstream for a couple years, though completely rewritten to be more idiomatic.
---
 llvm/lib/Transforms/Utils/VNCoercion.cpp      | 23 ++++----
 .../Transforms/GVN/non-integral-pointers.ll   | 52 +++++++++++++++++--
 2 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 1939c0e3b504..11b42eca4960 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -17,6 +17,7 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
                                      const DataLayout &DL) {
   Type *StoredTy = StoredVal->getType();
+
   if (StoredTy == LoadTy)
     return true;
 
@@ -46,6 +47,14 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
       return CI->isNullValue();
     return false;
   }
+
+
+  // The implementation below uses inttoptr for vectors of unequal size; we
+  // can't allow this for non integral pointers.  Wecould teach it to extract
+  // exact subvectors if desired. 
+  if (DL.isNonIntegralPointerType(StoredTy->getScalarType()) &&
+      StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedSize())
+    return false;
   
   return true;
 }
@@ -223,14 +232,8 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  // Don't coerce non-integral pointers to integers or vice versa.
-  if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()) !=
-      DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
-    // Allow casts of zero values to null as a special case
-    auto *CI = dyn_cast<Constant>(StoredVal);
-    if (!CI || !CI->isNullValue())
-      return -1;
-  }
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+    return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
   uint64_t StoreSize =
@@ -333,9 +336,7 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
   if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
     return -1;
 
-  // Don't coerce non-integral pointers to integers or vice versa.
-  if (DL.isNonIntegralPointerType(DepLI->getType()->getScalarType()) !=
-      DL.isNonIntegralPointerType(LoadTy->getScalarType()))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
diff --git a/llvm/test/Transforms/GVN/non-integral-pointers.ll b/llvm/test/Transforms/GVN/non-integral-pointers.ll
index a017dda926e3..872b6648084e 100644
--- a/llvm/test/Transforms/GVN/non-integral-pointers.ll
+++ b/llvm/test/Transforms/GVN/non-integral-pointers.ll
@@ -202,7 +202,7 @@ define i64 addrspace(4)* @neg_forward_memcopy2(i64 addrspace(4)* addrspace(4)* %
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* [[LOC]]
+; CHECK-NEXT:    [[REF:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* [[LOC]], align 8
 ; CHECK-NEXT:    ret i64 addrspace(4)* [[REF]]
 ;
 entry:
@@ -219,7 +219,7 @@ define i8 addrspace(4)* @forward_memcopy(i8 addrspace(4)* addrspace(4)* %loc) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 8, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], align 8
 ; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
 ;
 entry:
@@ -266,7 +266,7 @@ define <4 x i64 addrspace(4)*> @neg_forward_memcpy_vload2(<4 x i64 addrspace(4)*
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 32, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* [[LOC]]
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* [[LOC]], align 32
 ; CHECK-NEXT:    ret <4 x i64 addrspace(4)*> [[REF]]
 ;
 entry:
@@ -282,7 +282,7 @@ define <4 x i64> @neg_forward_memcpy_vload3(<4 x i64> addrspace(4)* %loc) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 32, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64>, <4 x i64> addrspace(4)* [[LOC]]
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64>, <4 x i64> addrspace(4)* [[LOC]], align 32
 ; CHECK-NEXT:    ret <4 x i64> [[REF]]
 ;
 entry:
@@ -386,3 +386,47 @@ entry:
   %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc.off
   ret i8 addrspace(4)* %ref
 }
+
+
+define void @smaller_vector(i8* %p) {
+; CHECK-LABEL: @smaller_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = bitcast i8* [[P:%.*]] to <4 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[P]] to <2 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[V4:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* [[A]], align 32
+; CHECK-NEXT:    [[V2:%.*]] = load <2 x i64 addrspace(4)*>, <2 x i64 addrspace(4)*>* [[B]], align 32
+; CHECK-NEXT:    call void @use.v2(<2 x i64 addrspace(4)*> [[V2]])
+; CHECK-NEXT:    call void @use.v4(<4 x i64 addrspace(4)*> [[V4]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = bitcast i8* %p to <4 x i64 addrspace(4)*>*
+  %b = bitcast i8* %p to <2 x i64 addrspace(4)*>*
+  %v4 = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* %a, align 32
+  %v2 = load <2 x i64 addrspace(4)*>, <2 x i64 addrspace(4)*>* %b, align 32
+  call void @use.v2(<2 x i64 addrspace(4)*> %v2)
+  call void @use.v4(<4 x i64 addrspace(4)*> %v4)
+  ret void
+}
+
+define i64 addrspace(4)* @vector_extract(i8* %p) {
+; CHECK-LABEL: @vector_extract(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = bitcast i8* [[P:%.*]] to <4 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[P]] to i64 addrspace(4)**
+; CHECK-NEXT:    [[V4:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* [[A]], align 32
+; CHECK-NEXT:    [[RES:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[B]], align 32
+; CHECK-NEXT:    call void @use.v4(<4 x i64 addrspace(4)*> [[V4]])
+; CHECK-NEXT:    ret i64 addrspace(4)* [[RES]]
+;
+entry:
+  %a = bitcast i8* %p to <4 x i64 addrspace(4)*>*
+  %b = bitcast i8* %p to i64 addrspace(4)**
+  %v4 = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* %a, align 32
+  %res = load i64 addrspace(4)*, i64 addrspace(4)** %b, align 32
+  call void @use.v4(<4 x i64 addrspace(4)*> %v4)
+  ret i64 addrspace(4)* %res
+}
+
+declare void @use.v2(<2 x i64 addrspace(4)*>)
+declare void @use.v4(<4 x i64 addrspace(4)*>)