[LoadStoreVectorizer] Use getMinusSCEV() to compute the distance between two pointers.

Summary: Currently, isConsecutiveAccess() detects two pointers (PtrA and PtrB) as consecutive by
         comparing PtrB with BaseDelta+PtrA. This works when both pointers are factorized or
         both of them are not factorized. But isConsecutiveAccess() fails if one of the
         pointers is factorized but the other one is not.

         Here is an example:
         PtrA = 4 * (A + B)
         PtrB = 4 + 4A + 4B

         This patch uses getMinusSCEV() to compute the distance between two pointers.
         getMinusSCEV() allows combining the expressions and computing the simplified distance.

Author: FarhanaAleen

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D49516

llvm-svn: 337471
This commit is contained in:
Farhana Aleen 2018-07-19 16:50:27 +00:00
parent d1cf276621
commit 8c7a30baea
2 changed files with 57 additions and 0 deletions

View File

@ -340,6 +340,14 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (X == PtrSCEVB)
return true;
// The above check will not catch the cases where one of the pointers is
// factorized but the other one is not, such as (C + (S * (A + B))) vs
// (AS + BS). Get the minus scev. That will allow re-combining the expressions
// and getting the simplified difference.
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
if (C == Dist)
return true;
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.

View File

@ -0,0 +1,49 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
declare i64 @_Z12get_local_idj(i32)
declare i64 @_Z12get_group_idj(i32)
declare double @llvm.fmuladd.f64(double, double, double)
; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
; Regression test: the two float accesses below address %c[%add10] and
; %c[%add10 | 1]. One address is built from factorized sub-expressions
; (shifts/ands combined with or/add), so the pre-patch BaseDelta+PtrA SCEV
; comparison failed to prove them consecutive; getMinusSCEV() simplifies
; the difference to 1 element and the pair is vectorized into <2 x float>.
define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
entry:
  ; Work-item / work-group IDs feed the index computation.
  %call = tail call i64 @_Z12get_local_idj(i32 0)
  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
  ; Build the element index %add10 from shifted/masked pieces of the IDs.
  ; The or/add mix of shl and and terms is what produces the factorized
  ; vs. non-factorized SCEV forms this test exercises.
  %div = lshr i64 %call, 4
  %div2 = lshr i64 %call1, 3
  %mul = shl i64 %div2, 7
  %rem = shl i64 %call, 3
  %mul3 = and i64 %rem, 120
  %add = or i64 %mul, %mul3
  %rem4 = shl i64 %call1, 7
  %mul5 = and i64 %rem4, 896
  %mul6 = shl nuw nsw i64 %div, 3
  %add7 = add nuw i64 %mul5, %mul6
  %mul9 = shl i64 %add7, 10
  %add10 = add i64 %mul9, %add
  ; First access: load %c[%add10], scale, fmuladd, store back.
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
  %load1 = load float, float addrspace(1)* %arrayidx, align 4
  %conv = fpext float %load1 to double
  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
  %conv12 = fptrunc double %mul11 to float
  %conv18 = fpext float %conv12 to double
  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
  %cstoreval1 = fptrunc double %storeval1 to float
  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
  ; Second access at index %add10 | 1 (the low bit of %add10 is known zero
  ; here, so this is %add10 + 1): the element immediately after the first.
  %add23 = or i64 %add10, 1
  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
  %conv25 = fpext float %load2 to double
  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
  %conv27 = fptrunc double %mul26 to float
  %conv34 = fpext float %conv27 to double
  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
  %cstoreval2 = fptrunc double %storeval2 to float
  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
  ret void
}