[LoadStoreVectorizer] Use getMinusSCEV() to compute the distance between two pointers.

Summary: Currently, isConsecutiveAccess() detects two pointers (PtrA and PtrB) as consecutive by
         comparing PtrB with BaseDelta+PtrA. This works when both pointers are factorized or
         both of them are not factorized. But isConsecutiveAccess() fails if one of the
         pointers is factorized but the other one is not.

         Here is an example:
         PtrA = 4 * (A + B)
         PtrB = 4 + 4A + 4B

         This patch uses getMinusSCEV() to compute the distance between two pointers.
         getMinusSCEV() allows combining the expressions and computing the simplified distance.

Author: FarhanaAleen

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D49516

llvm-svn: 337471
This commit is contained in:
Farhana Aleen 2018-07-19 16:50:27 +00:00
parent d1cf276621
commit 8c7a30baea
2 changed files with 57 additions and 0 deletions

View File

@ -340,6 +340,14 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (X == PtrSCEVB)
return true;
// The above check will not catch the cases where one of the pointers is
// factorized but the other one is not, such as (C + (S * (A + B))) vs
// (AS + BS). Get the minus scev. That will allow re-combining the expressions
// and getting the simplified difference.
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
if (C == Dist)
return true;
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.

View File

@ -0,0 +1,49 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
declare i64 @_Z12get_local_idj(i32)
declare i64 @_Z12get_group_idj(i32)
declare double @llvm.fmuladd.f64(double, double, double)
; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
; Regression test: the two float accesses below address %c[%add10] and
; %c[%add10 | 1]. One address is built from factorized sub-expressions
; (shifts/ands combined with or/add), so the pre-patch BaseDelta+PtrA SCEV
; comparison failed to prove them consecutive; getMinusSCEV() simplifies
; the difference to 1 element and the pair is vectorized into <2 x float>.
define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
entry:
  ; Work-item / work-group IDs feed the index computation.
  %call = tail call i64 @_Z12get_local_idj(i32 0)
  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
  ; Build the element index %add10 from shifted/masked pieces of the IDs.
  ; The or/add mix of shl and and terms is what produces the factorized
  ; vs. non-factorized SCEV forms this test exercises.
  %div = lshr i64 %call, 4
  %div2 = lshr i64 %call1, 3
  %mul = shl i64 %div2, 7
  %rem = shl i64 %call, 3
  %mul3 = and i64 %rem, 120
  %add = or i64 %mul, %mul3
  %rem4 = shl i64 %call1, 7
  %mul5 = and i64 %rem4, 896
  %mul6 = shl nuw nsw i64 %div, 3
  %add7 = add nuw i64 %mul5, %mul6
  %mul9 = shl i64 %add7, 10
  %add10 = add i64 %mul9, %add
  ; First access: load %c[%add10], scale, fmuladd, store back.
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
  %load1 = load float, float addrspace(1)* %arrayidx, align 4
  %conv = fpext float %load1 to double
  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
  %conv12 = fptrunc double %mul11 to float
  %conv18 = fpext float %conv12 to double
  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
  %cstoreval1 = fptrunc double %storeval1 to float
  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
  ; Second access at index %add10 | 1 (the low bit of %add10 is known zero
  ; here, so this is %add10 + 1): the element immediately after the first.
  %add23 = or i64 %add10, 1
  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
  %conv25 = fpext float %load2 to double
  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
  %conv27 = fptrunc double %mul26 to float
  %conv34 = fpext float %conv27 to double
  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
  %cstoreval2 = fptrunc double %storeval2 to float
  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
  ret void
}