2017-01-14 15:14:54 +08:00
|
|
|
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
|
|
|
; RUN: -polly-target-throughput-vector-fma=1 \
|
|
|
|
; RUN: -polly-target-latency-vector-fma=8 \
|
|
|
|
; RUN: -polly-target-1st-cache-level-associativity=8 \
|
|
|
|
; RUN: -polly-target-2nd-cache-level-associativity=8 \
|
|
|
|
; RUN: -polly-target-1st-cache-level-size=32768 \
|
|
|
|
; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \
|
|
|
|
; RUN: -polly-target-vector-register-bitwidth=256 \
|
|
|
|
; RUN: -analyze < %s | FileCheck %s
|
2016-12-15 20:35:59 +08:00
|
|
|
;
|
|
|
|
; /* C := alpha*A*B + beta*C */
|
|
|
|
; /* _PB_NK % Kc != 0 */
|
|
|
|
; for (i = 0; i < _PB_NI; i++)
|
|
|
|
; for (j = 0; j < _PB_NJ; j++)
|
|
|
|
; {
|
|
|
|
; C[i][j] *= beta;
|
|
|
|
; for (k = 0; k < _PB_NK; ++k)
|
|
|
|
; C[i][j] += alpha * A[i][k] * B[k][j];
|
|
|
|
; }
|
|
|
|
;
|
|
|
|
; CHECK: {
|
|
|
|
; CHECK-NEXT: // 1st level tiling - Tiles
|
|
|
|
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
|
|
|
|
; CHECK-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) {
|
|
|
|
; CHECK-NEXT: // 1st level tiling - Points
|
|
|
|
; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1)
|
|
|
|
; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
|
|
|
|
; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3);
|
|
|
|
; CHECK-NEXT: }
|
2017-03-22 22:25:24 +08:00
|
|
|
; CHECK-NEXT: // Inter iteration alias-free
|
2016-12-15 20:35:59 +08:00
|
|
|
; CHECK-NEXT: // 1st level tiling - Tiles
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the macro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39.247% of
theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) {
|
|
|
|
; CHECK-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1)
|
|
|
|
; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
|
|
|
|
; CHECK-NEXT: CopyStmt_0(0, c3, c4);
|
|
|
|
; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) {
|
|
|
|
; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1)
|
|
|
|
; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1)
|
|
|
|
; CHECK-NEXT: CopyStmt_1(c3, 0, c5);
|
|
|
|
; CHECK-NEXT: // 1st level tiling - Points
|
|
|
|
; CHECK-NEXT: // Register tiling - Tiles
|
|
|
|
; CHECK-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1)
|
|
|
|
; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1)
|
|
|
|
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) {
|
2017-08-23 01:38:46 +08:00
|
|
|
; CHECK-NEXT: // Loop Vectorizer Disabled
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the macro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39.247% of
theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
; CHECK-NEXT: // Register tiling - Points
|
|
|
|
; CHECK-NEXT: {
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
|
|
|
|
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
|
2016-12-15 20:35:59 +08:00
|
|
|
; CHECK-NEXT: }
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the macro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39.247% of
theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
; CHECK-NEXT: }
|
2016-12-15 20:35:59 +08:00
|
|
|
; CHECK-NEXT: }
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the macro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39.247% of
theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
; CHECK-NEXT: }
|
2016-12-15 20:35:59 +08:00
|
|
|
; CHECK-NEXT: }
|
|
|
|
;
|
|
|
|
; Module-level target settings: the data layout string and an x86_64 triple
; with unspecified vendor and OS.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
|
|
|
target triple = "x86_64-unknown-unknown"
|
|
|
|
|
|
|
|
; kernel_gemm: IR form of the gemm loop nest sketched in the pseudo-code at the
; top of this file: C[i][j] *= beta, then C[i][j] += alpha * A[i][k] * B[k][j].
;   %arg3 - multiplies each loaded A element (alpha in the pseudo-code)
;   %arg4 - multiplies C[i][j] before the accumulation (beta in the pseudo-code)
;   %arg5 - C, indexed [i][j], rows of 1056 doubles
;   %arg6 - A, indexed [i][k], rows of 1023 doubles
;   %arg7 - B, indexed [k][j], rows of 1056 doubles
;   %arg, %arg1, %arg2 - not referenced in the body; all trip counts are the
;                        constants 1056 (i), 1056 (j) and 1023 (k).
; NOTE(review): the expected output above names Stmt_bb9 and Stmt_Copy_0, which
; presumably derive from the block labels bb9 and Copy_0 - confirm before
; renaming any block.
define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1023 x double]* %arg6, [1056 x double]* %arg7) #0 {
|
|
|
|
bb:
|
|
|
|
br label %bb8
|
|
|
|
|
|
|
|
; Outer loop header: %tmp is the row index i, starting at 0.
bb8: ; preds = %bb29, %bb
|
|
|
|
%tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ]
|
|
|
|
br label %bb9
|
|
|
|
|
|
|
|
; Middle loop header: %tmp10 is the column index j. This block also performs
; the scaling step C[i][j] *= beta before entering the k loop.
bb9: ; preds = %bb26, %bb8
|
|
|
|
%tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ]
|
|
|
|
; %tmp11 = &C[i][j]; reused by every load/store of C in the k loop below.
%tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10
|
|
|
|
%tmp12 = load double, double* %tmp11, align 8
|
|
|
|
%tmp13 = fmul double %tmp12, %arg4
|
|
|
|
store double %tmp13, double* %tmp11, align 8
|
|
|
|
br label %Copy_0
|
|
|
|
|
|
|
|
; Innermost loop (single self-looping block): %tmp15 is the reduction index k.
; Each iteration computes C[i][j] += (A[i][k] * alpha) * B[k][j].
Copy_0: ; preds = %Copy_0, %bb9
|
|
|
|
%tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ]
|
|
|
|
; %tmp16 = &A[i][k]
%tmp16 = getelementptr inbounds [1023 x double], [1023 x double]* %arg6, i64 %tmp, i64 %tmp15
|
|
|
|
%tmp17 = load double, double* %tmp16, align 8
|
|
|
|
%tmp18 = fmul double %tmp17, %arg3
|
|
|
|
; %tmp19 = &B[k][j]
%tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10
|
|
|
|
%tmp20 = load double, double* %tmp19, align 8
|
|
|
|
%tmp21 = fmul double %tmp18, %tmp20
|
|
|
|
; C[i][j] is re-loaded from memory and stored back on every iteration.
%tmp22 = load double, double* %tmp11, align 8
|
|
|
|
%tmp23 = fadd double %tmp22, %tmp21
|
|
|
|
store double %tmp23, double* %tmp11, align 8
|
|
|
|
%tmp24 = add nuw nsw i64 %tmp15, 1
|
|
|
|
; k runs over 0..1022 (1023 iterations).
%tmp25 = icmp ne i64 %tmp24, 1023
|
|
|
|
br i1 %tmp25, label %Copy_0, label %bb26
|
|
|
|
|
|
|
|
; Latch of the j loop: advance j, exit after 1056 iterations.
bb26: ; preds = %Copy_0
|
|
|
|
%tmp27 = add nuw nsw i64 %tmp10, 1
|
|
|
|
%tmp28 = icmp ne i64 %tmp27, 1056
|
|
|
|
br i1 %tmp28, label %bb9, label %bb29
|
|
|
|
|
|
|
|
; Latch of the i loop: advance i, exit after 1056 iterations.
bb29: ; preds = %bb26
|
|
|
|
%tmp30 = add nuw nsw i64 %tmp, 1
|
|
|
|
%tmp31 = icmp ne i64 %tmp30, 1056
|
|
|
|
br i1 %tmp31, label %bb8, label %bb32
|
|
|
|
|
|
|
|
bb32: ; preds = %bb29
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Attribute group #0, attached to @kernel_gemm: nounwind and uwtable, plus a
; "target-cpu" of x86-64 and an explicit "target-features" list that includes +avx.
attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
|