Title: Loop Cache Analysis
Summary: Implement a new analysis to estimate the number of cache lines
required by a loop nest.
The analysis is largely based on the following paper:
Compiler Optimizations for Improving Data Locality
By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
The analysis considers temporal reuse (accesses to the same memory
location) and spatial reuse (accesses to memory locations within a cache
line). For simplicity the analysis considers memory accesses in the
innermost loop in a loop nest, and thus determines the number of cache
lines used when the loop L in loop nest LN is placed in the innermost
position.
The result of the analysis can be used to drive several transformations.
As an example, loop interchange could use it to determine which loops in a
perfect loop nest should be interchanged to maximize cache reuse.
Similarly, loop distribution could be enhanced to take into
consideration cache reuse between arrays when distributing a loop to
eliminate vectorization inhibiting dependencies.
The general approach taken to estimate the number of cache lines used by
the memory references in the inner loop of a loop nest is:
Partition memory references that exhibit temporal or spatial reuse into
reference groups.
For each loop L in the loop nest LN: a. Compute the cost of the
reference group b. Compute the 'cache cost' of the loop nest by summing
up the reference group costs
For further details of the algorithm please refer to the paper.
Authored By: etiotto
Reviewers: hfinkel, Meinersbur, jdoerfert, kbarton, bmahjour, anemet,
fhahn
Reviewed By: Meinersbur
Subscribers: reames, nemanjai, MaskRay, wuzish, Hahnfeld, xusx595,
venkataramanan.kumar.llvm, greened, dmgreen, steleman, fhahn, xblvaOO,
Whitney, mgorny, hiraditya, mgrang, jsji, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D63459
llvm-svn: 368439
2019-08-09 21:56:29 +08:00
|
|
|
; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
|
|
|
|
|
|
|
|
target datalayout = "e-m:e-i64:64-n32:64"
|
|
|
|
target triple = "powerpc64le-unknown-linux-gnu"
|
|
|
|
|
|
|
|
; void matmul(long n, long m, long o, int A[n][m], int B[n][m], int C[n][m]) {
|
|
|
|
; for (long i = 0; i < n; i++)
|
|
|
|
; for (long j = 0; j < m; j++)
|
|
|
|
; for (long k = 0; k < o; k++)
|
|
|
|
; C[i][j] = C[i][j] + A[i][k] * B[k][j];
|
|
|
|
; }
|
|
|
|
|
2019-08-10 00:18:22 +08:00
|
|
|
; CHECK-DAG:Loop 'for.i' has cost = 2010000
|
|
|
|
; CHECK-DAG:Loop 'for.k' has cost = 1040000
|
|
|
|
; CHECK-DAG:Loop 'for.j' has cost = 70000
|
Title: Loop Cache Analysis
Summary: Implement a new analysis to estimate the number of cache lines
required by a loop nest.
The analysis is largely based on the following paper:
Compiler Optimizations for Improving Data Locality
By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
The analysis considers temporal reuse (accesses to the same memory
location) and spatial reuse (accesses to memory locations within a cache
line). For simplicity the analysis considers memory accesses in the
innermost loop in a loop nest, and thus determines the number of cache
lines used when the loop L in loop nest LN is placed in the innermost
position.
The result of the analysis can be used to drive several transformations.
As an example, loop interchange could use it to determine which loops in a
perfect loop nest should be interchanged to maximize cache reuse.
Similarly, loop distribution could be enhanced to take into
consideration cache reuse between arrays when distributing a loop to
eliminate vectorization inhibiting dependencies.
The general approach taken to estimate the number of cache lines used by
the memory references in the inner loop of a loop nest is:
Partition memory references that exhibit temporal or spatial reuse into
reference groups.
For each loop L in the loop nest LN: a. Compute the cost of the
reference group b. Compute the 'cache cost' of the loop nest by summing
up the reference group costs
For further details of the algorithm please refer to the paper.
Authored By: etiotto
Reviewers: hfinkel, Meinersbur, jdoerfert, kbarton, bmahjour, anemet,
fhahn
Reviewed By: Meinersbur
Subscribers: reames, nemanjai, MaskRay, wuzish, Hahnfeld, xusx595,
venkataramanan.kumar.llvm, greened, dmgreen, steleman, fhahn, xblvaOO,
Whitney, mgorny, hiraditya, mgrang, jsji, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D63459
llvm-svn: 368439
2019-08-09 21:56:29 +08:00
|
|
|
|
|
|
|
define void @matmul(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) {
|
|
|
|
entry:
|
|
|
|
br label %for.i
|
|
|
|
|
|
|
|
for.i: ; preds = %entry, %for.inc.i
|
|
|
|
%i = phi i64 [ 0, %entry ], [ %i.next, %for.inc.i ]
|
|
|
|
%muli = mul i64 %i, %m
|
|
|
|
br label %for.j
|
|
|
|
|
|
|
|
for.j: ; preds = %for.i, %for.inc.j
|
|
|
|
%j = phi i64 [ 0, %for.i ], [ %j.next, %for.inc.j ]
|
|
|
|
%addj = add i64 %muli, %j
|
|
|
|
%mulj = mul i64 %addj, %o
|
|
|
|
br label %for.k
|
|
|
|
|
|
|
|
for.k: ; preds = %for.j, %for.inc.k
|
|
|
|
%k = phi i64 [ 0, %for.j ], [ %k.next, %for.inc.k ]
|
|
|
|
|
|
|
|
; A[i][k]
|
|
|
|
%arrayidx3 = add i64 %k, %muli
|
|
|
|
%arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %arrayidx3
|
|
|
|
%elem_A = load i32, i32* %arrayidx4, align 4
|
|
|
|
|
|
|
|
; B[k][j]
|
|
|
|
%mulk = mul i64 %k, %o
|
|
|
|
%arrayidx5 = add i64 %j, %mulk
|
|
|
|
%arrayidx6 = getelementptr inbounds i32, i32* %B, i64 %arrayidx5
|
|
|
|
%elem_B = load i32, i32* %arrayidx6, align 4
|
|
|
|
|
|
|
|
; C[i][k]
|
|
|
|
%arrayidx7 = add i64 %j, %muli
|
|
|
|
%arrayidx8 = getelementptr inbounds i32, i32* %C, i64 %arrayidx7
|
|
|
|
%elem_C = load i32, i32* %arrayidx8, align 4
|
|
|
|
|
|
|
|
; C[i][j] = C[i][j] + A[i][k] * B[k][j];
|
|
|
|
%mul = mul nsw i32 %elem_A, %elem_B
|
|
|
|
%add = add nsw i32 %elem_C, %mul
|
|
|
|
store i32 %add, i32* %arrayidx8, align 4
|
|
|
|
|
|
|
|
br label %for.inc.k
|
|
|
|
|
|
|
|
for.inc.k: ; preds = %for.k
|
|
|
|
%k.next = add nuw nsw i64 %k, 1
|
|
|
|
%exitcond = icmp ne i64 %k.next, %o
|
|
|
|
br i1 %exitcond, label %for.k, label %for.end
|
|
|
|
|
|
|
|
for.end: ; preds = %for.inc
|
|
|
|
br label %for.inc.j
|
|
|
|
|
|
|
|
for.inc.j: ; preds = %for.end
|
|
|
|
%j.next = add nuw nsw i64 %j, 1
|
|
|
|
%exitcond5 = icmp ne i64 %j.next, %m
|
|
|
|
br i1 %exitcond5, label %for.j, label %for.end23
|
|
|
|
|
|
|
|
for.end23: ; preds = %for.inc.j
|
|
|
|
br label %for.inc.i
|
|
|
|
|
|
|
|
for.inc.i: ; preds = %for.end23
|
|
|
|
%i.next = add nuw nsw i64 %i, 1
|
|
|
|
%exitcond8 = icmp ne i64 %i.next, %n
|
|
|
|
br i1 %exitcond8, label %for.i, label %for.end26
|
|
|
|
|
|
|
|
for.end26: ; preds = %for.inc.i
|
|
|
|
ret void
|
|
|
|
}
|