; llvm-project/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll

; Print the loop cache cost analysis for this module with opt's normal output
; disabled, and pipe stdout and stderr together into FileCheck.
; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s

; Target: 64-bit little-endian PowerPC Linux.
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"

; void foo(long n, long m, int A[n][m], int B[n][m], int C[n]) {
;   for (long i = 0; i < n; i++)
;     for (long j = 0; j < m; j++) {
;         A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i];
;         A[i][j] += B[i][i];
;     }
; }

; CHECK-DAG: Loop 'for.i' has cost = 20600
; CHECK-DAG: Loop 'for.j' has cost = 800

; @foo: guarded two-level loop nest (outer header %for.i, inner header
; %for.j) that loads and stores i32 elements of %A, %B and %C through
; manually linearized indices of the form row * %m + column.
;   %n          - bound of the outer induction variable %i
;   %m          - bound of the inner induction variable %j, and the row stride
;                 (in elements) used to linearize the 2-D accesses
;   %A, %B, %C  - base pointers of the i32 arrays
; The block names for.i and for.j are the loop names matched by this test's
; FileCheck directives, so they must not be renamed.
define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) {
entry:
  ; Guards: the nest is only entered when both %n > 0 and %m > 0.
  %cmp32 = icmp sgt i64 %n, 0
  %cmp230 = icmp sgt i64 %m, 0
  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end

for.cond1.preheader.lr.ph:                        ; preds = %entry
  br i1 %cmp230, label %for.i.preheader, label %for.end

for.i.preheader:                                  ; preds = %for.cond1.preheader.lr.ph
  br label %for.i

; Outer loop header: %i runs from 0 and is stepped in %for.inci.
for.i:                                            ; preds = %for.inci, %for.i.preheader
  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader ]
  ; Row offsets (in elements) for rows i-1, i and i+1.
  ; %muliplusone has no users in this function.
  %subione = sub i64 %i, 1
  %addione = add i64 %i, 1
  %muli = mul i64 %i, %m
  %muliminusone = mul i64 %subione, %m
  %muliplusone = mul i64 %addione, %m
  br label %for.j

; Inner loop header: %j runs from 0 and is stepped in %for.incj.
for.j:                                            ; preds = %for.incj, %for.i
  %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ]
  ; %addj has no users in this function.
  %addj = add i64 %muli, %j

  ; B[i-1][j]
  %arrayidx1 = add i64 %j, %muliminusone
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1
  %elem_B1 = load i32, i32* %arrayidx2, align 4

  ; B[i-1][j+1]
  %addjone = add i64 %j, 1
  %arrayidx3 = add i64 %addjone, %muliminusone
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %arrayidx3
  %elem_B2 = load i32, i32* %arrayidx4, align 4

  ; C[i]
  %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %i
  %elem_C = load i32, i32* %arrayidx6, align 4

  ; A[i][j+1]
  %arrayidx7 = add i64 %addjone, %muli
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %arrayidx7
  %elem_A = load i32, i32* %arrayidx8, align 4

  ; A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i]
  ; NOTE: the value stored below is %addA = %elem_A + %elem_C. %addC, which
  ; folds in the two B loads, has no users, so the B sum never reaches the
  ; store; the B loads themselves are still present as memory references.
  %addB = add i32 %elem_B1, %elem_B2
  %addC = add i32 %addB, %elem_C
  %addA = add i32 %elem_A, %elem_C
  %arrayidx9 = add i64 %j, %muli
  %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 %arrayidx9
  store i32 %addA, i32* %arrayidx10, align 4

  ; A[i][j] += B[i][i];
  ; A[i][j] is re-addressed through a second GEP (%arrayidx12) that is used for
  ; both the reload and the final store; B[i][i] is addressed as i * %m + i.
  %arrayidx11 = add i64 %j, %muli
  %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 %arrayidx11
  %elem_A1 = load i32, i32* %arrayidx12, align 4
  %arrayidx13 = add i64 %i, %muli
  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 %arrayidx13
  %elem_B3 = load i32, i32* %arrayidx14, align 4
  %addA1 = add i32 %elem_A1, %elem_B3
  store i32 %addA1, i32* %arrayidx12, align 4

  br label %for.incj

; Inner loop latch: step %j and leave the inner loop once %incj == %m.
for.incj:                                         ; preds = %for.j
  %incj = add nsw i64 %j, 1
  %exitcond54.us = icmp eq i64 %incj, %m
  br i1 %exitcond54.us, label %for.inci, label %for.j

; Outer loop latch: step %i and leave the nest once %inci == %n.
for.inci:                                         ; preds = %for.incj
  %inci = add nsw i64 %i, 1
  %exitcond55.us = icmp eq i64 %inci, %n
  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i

for.end.loopexit:                                 ; preds = %for.inci
  br label %for.end

; Common exit, reached from the loop exit and from both guard branches.
for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
  ret void
}
Title: Loop Cache Analysis Summary: Implement a new analysis to estimate the number of cache lines required by a loop nest. The analysis is largely based on the following paper: Compiler Optimizations for Improving Data Locality By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf The analysis considers temporal reuse (accesses to the same memory location) and spatial reuse (accesses to memory locations within a cache line). For simplicity the analysis considers memory accesses in the innermost loop in a loop nest, and thus determines the number of cache lines used when the loop L in loop nest LN is placed in the innermost position. The result of the analysis can be used to drive several transformations. As an example, loop interchange could use it to determine which loops in a perfect loop nest should be interchanged to maximize cache reuse. Similarly, loop distribution could be enhanced to take into consideration cache reuse between arrays when distributing a loop to eliminate vectorization-inhibiting dependencies. The general approach taken to estimate the number of cache lines used by the memory references in the inner loop of a loop nest is: Partition memory references that exhibit temporal or spatial reuse into reference groups. For each loop L in a loop nest LN: a. Compute the cost of the reference group b. Compute the 'cache cost' of the loop nest by summing up the reference group costs For further details of the algorithm please refer to the paper. 
Authored By: etiotto Reviewers: hfinkel, Meinersbur, jdoerfert, kbarton, bmahjour, anemet, fhahn Reviewed By: Meinersbur Subscribers: reames, nemanjai, MaskRay, wuzish, Hahnfeld, xusx595, venkataramanan.kumar.llvm, greened, dmgreen, steleman, fhahn, xblvaOO, Whitney, mgorny, hiraditya, mgrang, jsji, llvm-commits Tag: LLVM Differential Revision: https://reviews.llvm.org/D63459 llvm-svn: 368439 2019-08-09 21:56:29 +08:00			`; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 \| FileCheck %s`

			`target datalayout = "e-m:e-i64:64-n32:64"`
			`target triple = "powerpc64le-unknown-linux-gnu"`

			`; void foo(long n, long m, long o, int A[n][m], int B[n][m], int C[n]) {`
			`; for (long i = 0; i < n; i++)`
			`; for (long j = 0; j < m; j++) {`
			`; A[i][j] = A[i][j+1] + B[i-1][j] + B[i+1][j+1] + C[i];`
			`; A[i][j] += B[i][i];`
			`; }`
			`; }`

Title: Improve Loop Cache Analysis LIT tests. Summary: Make LIT tests insensitive to analysis output order. Authored By: etiotto llvm-svn: 368450 2019-08-10 00:18:22 +08:00			`; CHECK-DAG: Loop 'for.i' has cost = 20600`
			`; CHECK-DAG: Loop 'for.j' has cost = 800`
Title: Loop Cache Analysis Summary: Implement a new analysis to estimate the number of cache lines required by a loop nest. The analysis is largely based on the following paper: Compiler Optimizations for Improving Data Locality By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf The analysis considers temporal reuse (accesses to the same memory location) and spatial reuse (accesses to memory locations within a cache line). For simplicity the analysis considers memory accesses in the innermost loop in a loop nest, and thus determines the number of cache lines used when the loop L in loop nest LN is placed in the innermost position. The result of the analysis can be used to drive several transformations. As an example, loop interchange could use it to determine which loops in a perfect loop nest should be interchanged to maximize cache reuse. Similarly, loop distribution could be enhanced to take into consideration cache reuse between arrays when distributing a loop to eliminate vectorization-inhibiting dependencies. The general approach taken to estimate the number of cache lines used by the memory references in the inner loop of a loop nest is: Partition memory references that exhibit temporal or spatial reuse into reference groups. For each loop L in a loop nest LN: a. Compute the cost of the reference group b. Compute the 'cache cost' of the loop nest by summing up the reference group costs For further details of the algorithm please refer to the paper. Authored By: etiotto Reviewers: hfinkel, Meinersbur, jdoerfert, kbarton, bmahjour, anemet, fhahn Reviewed By: Meinersbur Subscribers: reames, nemanjai, MaskRay, wuzish, Hahnfeld, xusx595, venkataramanan.kumar.llvm, greened, dmgreen, steleman, fhahn, xblvaOO, Whitney, mgorny, hiraditya, mgrang, jsji, llvm-commits Tag: LLVM Differential Revision: https://reviews.llvm.org/D63459 llvm-svn: 368439 2019-08-09 21:56:29 +08:00
			`define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) {`
			`entry:`
			`%cmp32 = icmp sgt i64 %n, 0`
			`%cmp230 = icmp sgt i64 %m, 0`
			`br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end`

			`for.cond1.preheader.lr.ph: ; preds = %entry`
			`br i1 %cmp230, label %for.i.preheader, label %for.end`

			`for.i.preheader: ; preds = %for.cond1.preheader.lr.ph`
			`br label %for.i`

			`for.i: ; preds = %for.inci, %for.i.preheader.split`
			`%i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader ]`
			`%subione = sub i64 %i, 1`
			`%addione = add i64 %i, 1`
			`%muli = mul i64 %i, %m`
			`%muliminusone = mul i64 %subione, %m`
			`%muliplusone = mul i64 %addione, %m`
			`br label %for.j`

			`for.j: ; preds = %for.incj, %for.i`
			`%j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ]`
			`%addj = add i64 %muli, %j`

			`; B[i-1][j]`
			`%arrayidx1 = add i64 %j, %muliminusone`
			`%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1`
			`%elem_B1 = load i32, i32* %arrayidx2, align 4`

			`; B[i-1][j+1]`
			`%addjone = add i64 %j, 1`
			`%arrayidx3 = add i64 %addjone, %muliminusone`
			`%arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %arrayidx3`
			`%elem_B2 = load i32, i32* %arrayidx4, align 4`

			`; C[i]`
			`%arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %i`
			`%elem_C = load i32, i32* %arrayidx6, align 4`

			`; A[i][j+1]`
			`%arrayidx7 = add i64 %addjone, %muli`
			`%arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %arrayidx7`
			`%elem_A = load i32, i32* %arrayidx8, align 4`

			`; A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i]`
			`%addB = add i32 %elem_B1, %elem_B2`
			`%addC = add i32 %addB, %elem_C`
			`%addA = add i32 %elem_A, %elem_C`
			`%arrayidx9 = add i64 %j, %muli`
			`%arrayidx10 = getelementptr inbounds i32, i32* %A, i64 %arrayidx9`
			`store i32 %addA, i32* %arrayidx10, align 4`

			`; A[i][j] += B[i][i];`
			`%arrayidx11 = add i64 %j, %muli`
			`%arrayidx12 = getelementptr inbounds i32, i32* %A, i64 %arrayidx11`
			`%elem_A1 = load i32, i32* %arrayidx12, align 4`
			`%arrayidx13 = add i64 %i, %muli`
			`%arrayidx14 = getelementptr inbounds i32, i32* %B, i64 %arrayidx13`
			`%elem_B3 = load i32, i32* %arrayidx14, align 4`
			`%addA1 = add i32 %elem_A1, %elem_B3`
			`store i32 %addA1, i32* %arrayidx12, align 4`

			`br label %for.incj`

			`for.incj: ; preds = %for.j`
			`%incj = add nsw i64 %j, 1`
			`%exitcond54.us = icmp eq i64 %incj, %m`
			`br i1 %exitcond54.us, label %for.inci, label %for.j`

			`for.inci: ; preds = %for.incj`
			`%inci = add nsw i64 %i, 1`
			`%exitcond55.us = icmp eq i64 %inci, %n`
			`br i1 %exitcond55.us, label %for.end.loopexit, label %for.i`

			`for.end.loopexit: ; preds = %for.inci`
			`br label %for.end`

			`for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry`
			`ret void`
			`}`