[LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.

Summary:
LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize.
A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions
in heads(H) and tails(T) match, which is not the case if there are multiple tails for the same head.

e.g.:
i1: store a[0]
i2: store a[1]
i3: store a[1]
Leads to:
H: i1
T: i2 i3
Instead of:
H: i1 i1
T: i2 i3
So the positions for instructions that follow i3 will have different indexes in H/T.
This patch resolves PR29148.

This issue also surfaced the fact that if the chain is too long, and TLI
returns a "not-fast" answer, the whole chain will be abandoned for
vectorization, even though a smaller one would be beneficial.
Added a testcase and FIXME for this.

Reviewers: tstellarAMD, arsenm, jlebar

Subscribers: mzolotukhin, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D24057

llvm-svn: 280179
This commit is contained in:
Alina Sbirlea 2016-08-30 23:53:59 +00:00
parent fdb32d566a
commit 3f8f7840bf
3 changed files with 101 additions and 7 deletions

View File

@ -628,7 +628,7 @@ bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
SmallSetVector<int, 16> Heads, Tails;
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
// Do a quadratic search on all of the given stores and find all of the pairs
@ -647,8 +647,8 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
continue; // Should not insert.
}
Tails.insert(j);
Heads.insert(i);
Tails.push_back(j);
Heads.push_back(i);
ConsecutiveChain[i] = j;
}
}
@ -660,21 +660,21 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
for (int Head : Heads) {
if (InstructionsProcessed.count(Instrs[Head]))
continue;
bool longerChainExists = false;
bool LongerChainExists = false;
for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
if (Head == Tails[TIt] &&
!InstructionsProcessed.count(Instrs[Heads[TIt]])) {
longerChainExists = true;
LongerChainExists = true;
break;
}
if (longerChainExists)
if (LongerChainExists)
continue;
// We found an instr that starts a chain. Now follow the chain and try to
// vectorize it.
SmallVector<Instruction *, 16> Operands;
int I = Head;
while (I != -1 && (Tails.count(I) || Heads.count(I))) {
while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
if (InstructionsProcessed.count(Instrs[I]))
break;

View File

@ -0,0 +1,64 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
; Checks that there is no crash when there are multiple tails
; for the same head starting a chain.
@0 = internal addrspace(3) global [16384 x i32] undef
; CHECK-LABEL: @no_crash(
; CHECK: store <2 x i32> zeroinitializer
; CHECK: store i32 0
; CHECK: store i32 0
define void @no_crash(i32 %arg) {
; Regression input: one store to a slot followed by three stores to the
; next slot, so a single head store has several candidate tail stores.
; Element indices %arg+14 and %arg+15 select two neighbouring i32 slots of @0.
%tmp2 = add i32 %arg, 14
%tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
%tmp4 = add i32 %arg, 15
%tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4
; The only store to the lower slot (index %arg+14).
store i32 0, i32 addrspace(3)* %tmp3, align 4
; Three stores to the upper slot (index %arg+15); each targets the element
; immediately after the one written by the store above.
store i32 0, i32 addrspace(3)* %tmp5, align 4
store i32 0, i32 addrspace(3)* %tmp5, align 4
store i32 0, i32 addrspace(3)* %tmp5, align 4
ret void
}
; Check adjacent memory locations are properly matched and the
; longest chain vectorized
; CHECK-LABEL: @interleave_get_longest
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: store <2 x i32> zeroinitializer
; CHECK: load i32
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: load i32
define void @interleave_get_longest(i32 %arg) {
; Loads and stores over five consecutive i32 slots of @0, interleaved so that
; the same slot is accessed by more than one instruction.
; Element indices %arg+1 .. %arg+4 (index %arg itself is used directly below).
%a1 = add i32 %arg, 1
%a2 = add i32 %arg, 2
%a3 = add i32 %arg, 3
%a4 = add i32 %arg, 4
; %tmp1..%tmp5 address elements %arg, %arg+1, %arg+2, %arg+3 and %arg+4.
%tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
%tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
%tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
%tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
%tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4
; Loads of elements %arg+1 and %arg, issued in descending address order.
%l1 = load i32, i32 addrspace(3)* %tmp2, align 4
%l2 = load i32, i32 addrspace(3)* %tmp1, align 4
; Stores to the same two elements, placed between the two groups of loads.
store i32 0, i32 addrspace(3)* %tmp2, align 4
store i32 0, i32 addrspace(3)* %tmp1, align 4
; Loads of elements %arg+1 through %arg+4 in ascending address order.
%l3 = load i32, i32 addrspace(3)* %tmp2, align 4
%l4 = load i32, i32 addrspace(3)* %tmp3, align 4
%l5 = load i32, i32 addrspace(3)* %tmp4, align 4
%l6 = load i32, i32 addrspace(3)* %tmp5, align 4
; Two further loads of the last element (%arg+4), giving it three loads in total.
%l7 = load i32, i32 addrspace(3)* %tmp5, align 4
%l8 = load i32, i32 addrspace(3)* %tmp5, align 4
ret void
}

View File

@ -85,3 +85,33 @@ define void @chain_prefix_suffix(i32* noalias %ptr) {
ret void
}
; FIXME: If the chain is too long and TLI says misaligned is not fast,
; then LSV fails to vectorize anything in that chain.
; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
; CHECK-LABEL: @interleave_get_longest
; CHECK: load <3 x i32>
; CHECK: load i32
; CHECK: store <2 x i32> zeroinitializer
; CHECK: load i32
; CHECK: load i32
; CHECK: load i32
define void @interleave_get_longest(i32* noalias %ptr) {
; Loads and stores over four consecutive i32 elements starting at %ptr,
; interleaved so that the same element is accessed by several instructions.
; %tmp1..%tmp4 address elements 0, 1, 2 and 3 relative to %ptr.
%tmp1 = getelementptr i32, i32* %ptr, i64 0
%tmp2 = getelementptr i32, i32* %ptr, i64 1
%tmp3 = getelementptr i32, i32* %ptr, i64 2
%tmp4 = getelementptr i32, i32* %ptr, i64 3
; Loads of elements 1 and 0, issued in descending address order.
%l1 = load i32, i32* %tmp2, align 4
%l2 = load i32, i32* %tmp1, align 4
; Stores to the same two elements, placed between the two groups of loads.
store i32 0, i32* %tmp2, align 4
store i32 0, i32* %tmp1, align 4
; Loads of elements 1 through 3 in ascending address order.
%l3 = load i32, i32* %tmp2, align 4
%l4 = load i32, i32* %tmp3, align 4
%l5 = load i32, i32* %tmp4, align 4
; Two further loads of the last element (3), giving it three loads in total.
%l6 = load i32, i32* %tmp4, align 4
%l7 = load i32, i32* %tmp4, align 4
ret void
}