2017-01-14 15:14:54 +08:00
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -analyze -polly-ast -polly-target-1st-cache-level-size=0 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: < %s 2>&1 | FileCheck %s
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \
; RUN: -polly-target-2nd-cache-level-associativity=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 < %s 2>&1 \
; RUN: | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
2016-06-22 17:52:37 +08:00
;
; /* C := alpha*A*B + beta*C */
; for (i = 0; i < _PB_NI; i++)
; for (j = 0; j < _PB_NJ; j++)
; {
; C[i][j] *= beta;
; for (k = 0; k < _PB_NK; ++k)
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
2020-07-11 05:05:42 +08:00
; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
2016-06-22 17:52:37 +08:00
; CHECK: {
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; CHECK-NEXT: // 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) {
; CHECK-NEXT: // 1st level tiling - Points
; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1)
; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
2017-02-02 22:23:14 +08:00
; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; CHECK-NEXT: }
2017-03-22 22:25:24 +08:00
; CHECK-NEXT: // Inter iteration alias-free
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; CHECK-NEXT: // Register tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 131; c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= 263; c1 += 1)
; CHECK-NEXT: for (int c2 = 0; c2 <= 1023; c2 += 1) {
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
2017-02-02 22:23:14 +08:00
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 2, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 3, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 4, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 5, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 6, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 7, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 1, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 2, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 3, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 4, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 5, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 6, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 7, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 1, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 2, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 3, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 4, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 5, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 6, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 7, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 1, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 2, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 3, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 4, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 5, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 6, c2);
; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 7, c2);
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: }
2016-06-22 17:52:37 +08:00
;
2020-07-11 05:05:42 +08:00
; EXTRACTION-OF-MACRO-KERNEL-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; EXTRACTION-OF-MACRO-KERNEL: {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
2017-02-02 22:23:14 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
2017-03-22 22:25:24 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Inter iteration alias-free
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles
2017-02-02 22:23:14 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: CopyStmt_0(0, c3, c4);
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) {
2021-06-06 19:26:24 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c6 = 96 * c2; c6 <= 96 * c2 + 95; c6 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: CopyStmt_1(0, c1, c2, c6, c7);
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) {
2017-08-23 01:38:46 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Loop Vectorizer Disabled
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points
; EXTRACTION-OF-MACRO-KERNEL-NEXT: {
2017-02-02 22:23:14 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
2017-02-02 22:23:14 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
; EXTRACTION-OF-MACRO-KERNEL-NEXT: }
2016-07-25 17:42:53 +08:00
;
2016-06-22 17:52:37 +08:00
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define internal void @kernel_gemm ( i32 %arg , i32 %arg1 , i32 %arg2 , double %arg3 , double %arg4 , [ 1056 x double ] * %arg5 , [ 1024 x double ] * %arg6 , [ 1056 x double ] * %arg7 ) #0 {
bb:
br label %bb8
2017-02-02 22:23:14 +08:00
bb8: ; preds = %bb29, %bb
%tmp = phi i64 [ 0 , %bb ] , [ %tmp30 , %bb29 ]
br label %bb9
2016-06-22 17:52:37 +08:00
2017-02-02 22:23:14 +08:00
bb9: ; preds = %bb26, %bb8
%tmp10 = phi i64 [ 0 , %bb8 ] , [ %tmp27 , %bb26 ]
%tmp11 = getelementptr inbounds [ 1056 x double ] , [ 1056 x double ] * %arg5 , i64 %tmp , i64 %tmp10
%tmp12 = load double , double * %tmp11 , align 8
%tmp13 = fmul double %tmp12 , %arg4
store double %tmp13 , double * %tmp11 , align 8
br label %Copy_0
2016-06-22 17:52:37 +08:00
2017-02-02 22:23:14 +08:00
Copy_0: ; preds = %Copy_0, %bb9
%tmp15 = phi i64 [ 0 , %bb9 ] , [ %tmp24 , %Copy_0 ]
%tmp16 = getelementptr inbounds [ 1024 x double ] , [ 1024 x double ] * %arg6 , i64 %tmp , i64 %tmp15
%tmp17 = load double , double * %tmp16 , align 8
%tmp18 = fmul double %tmp17 , %arg3
%tmp19 = getelementptr inbounds [ 1056 x double ] , [ 1056 x double ] * %arg7 , i64 %tmp15 , i64 %tmp10
%tmp20 = load double , double * %tmp19 , align 8
%tmp21 = fmul double %tmp18 , %tmp20
%tmp22 = load double , double * %tmp11 , align 8
%tmp23 = fadd double %tmp22 , %tmp21
store double %tmp23 , double * %tmp11 , align 8
%tmp24 = add nuw nsw i64 %tmp15 , 1
%tmp25 = icmp ne i64 %tmp24 , 1024
br i1 %tmp25 , label %Copy_0 , label %bb26
2016-06-22 17:52:37 +08:00
2017-02-02 22:23:14 +08:00
bb26: ; preds = %Copy_0
%tmp27 = add nuw nsw i64 %tmp10 , 1
%tmp28 = icmp ne i64 %tmp27 , 1056
br i1 %tmp28 , label %bb9 , label %bb29
2016-06-22 17:52:37 +08:00
2017-02-02 22:23:14 +08:00
bb29: ; preds = %bb26
%tmp30 = add nuw nsw i64 %tmp , 1
%tmp31 = icmp ne i64 %tmp30 , 1056
br i1 %tmp31 , label %bb8 , label %bb32
2016-06-22 17:52:37 +08:00
2017-02-02 22:23:14 +08:00
bb32: ; preds = %bb29
2016-06-22 17:52:37 +08:00
ret void
}