[NFC][X86][LV] Add basic costmodel test coverage for not-fully-interleaved i32 loads

The coverage could suffer a combinatorial explosion here,
so I'm adding only the most basic cases
and hoping that is enough; more can be added if needed.
This commit is contained in:
Roman Lebedev 2021-10-05 19:28:23 +03:00
parent 16b8f4ddae
commit 200edc152b
No known key found for this signature in database
GPG Key ID: 083C3EBB4A1689E0
6 changed files with 441 additions and 0 deletions

View File

@ -0,0 +1,71 @@
; Loop-vectorizer cost-model test for a strided i32 load. The loop in @test
; advances its index by 2 and loads a single i32 per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; Its text has to spell the same "align 4" that the positive checks match,
; otherwise the pattern can never occur in the output and the check is vacuous.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 at @A[%iv], truncates it to
; i8 and stores it to @B[%iv]. The index advances by 2 per iteration, so only
; every other element of @A is read.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
; The only load in the loop; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
; Adding zero keeps the value unchanged - presumably a stand-in for the stride
; member that is not loaded (naming suggests a generated template; confirm).
%reduce.add.0 = add i32 %v0, 0
%reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.0.narrow, i8* %out
; Step of 2 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 2
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}

View File

@ -0,0 +1,75 @@
; Loop-vectorizer cost-model test for strided i32 loads. The loop in @test
; advances its index by 3 and loads two adjacent i32 values per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 values at @A[%iv] and
; @A[%iv + 1], adds them, truncates the sum to i8 and stores it to @B[%iv].
; The index advances by 3 per iteration, so one of every three elements of @A
; is skipped.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
; NOTE(review): the last iteration runs with %iv = 1023, which makes %iv.1
; equal to 1024 - one element past the end of @A. Harmless for a cost-only
; test that never executes the loop, but worth confirming it is intended.
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
; First load of the pair; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
%v1 = load i32, i32* %in1
%reduce.add.0 = add i32 %v0, %v1
; Adding zero keeps the value unchanged - presumably a stand-in for the stride
; member that is not loaded (naming suggests a generated template; confirm).
%reduce.add.1 = add i32 %reduce.add.0, 0
%reduce.add.1.narrow = trunc i32 %reduce.add.1 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.1.narrow, i8* %out
; Step of 3 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 3
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}

View File

@ -0,0 +1,70 @@
; Loop-vectorizer cost-model test for a strided i32 load. The loop in @test
; advances its index by 3 and loads a single i32 per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; Without it, a change that makes the vectorizer cost additional VFs would
; go unnoticed, because the positive checks above only require their own lines.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 at @A[%iv], truncates it to
; i8 and stores it to @B[%iv]. The index advances by 3 per iteration, so only
; one of every three elements of @A is read.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
; The only load in the loop; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
; The two additions of zero keep the value unchanged - presumably stand-ins
; for the two stride members that are not loaded (generated template; confirm).
%reduce.add.0 = add i32 %v0, 0
%reduce.add.1 = add i32 %reduce.add.0, 0
%reduce.add.1.narrow = trunc i32 %reduce.add.1 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.1.narrow, i8* %out
; Step of 3 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 3
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}

View File

@ -0,0 +1,76 @@
; Loop-vectorizer cost-model test for strided i32 loads. The loop in @test
; advances its index by 4 and loads three adjacent i32 values per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; Note that the expected SSE2 list ends at VF 8 and the expected AVX1 and AVX2
; lists end at VF 16; the closing negative check enforces those upper ends.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 values at @A[%iv],
; @A[%iv + 1] and @A[%iv + 2], sums them, truncates the sum to i8 and stores it
; to @B[%iv]. The index advances by 4 per iteration, so one of every four
; elements of @A is skipped.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%iv.2 = add nuw nsw i64 %iv, 2
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
%in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2
; First load of the three; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
%v1 = load i32, i32* %in1
%v2 = load i32, i32* %in2
%reduce.add.0 = add i32 %v0, %v1
%reduce.add.1 = add i32 %reduce.add.0, %v2
; Adding zero keeps the value unchanged - presumably a stand-in for the stride
; member that is not loaded (naming suggests a generated template; confirm).
%reduce.add.2 = add i32 %reduce.add.1, 0
%reduce.add.2.narrow = trunc i32 %reduce.add.2 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.2.narrow, i8* %out
; Step of 4 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 4
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}

View File

@ -0,0 +1,76 @@
; Loop-vectorizer cost-model test for strided i32 loads. The loop in @test
; advances its index by 4 and loads two adjacent i32 values per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 values at @A[%iv] and
; @A[%iv + 1], adds them, truncates the sum to i8 and stores it to @B[%iv].
; The index advances by 4 per iteration, so two of every four elements of @A
; are skipped.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%iv.1 = add nuw nsw i64 %iv, 1
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
%in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
; First load of the pair; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
%v1 = load i32, i32* %in1
%reduce.add.0 = add i32 %v0, %v1
; The two additions of zero keep the value unchanged - presumably stand-ins
; for the two stride members that are not loaded (generated template; confirm).
%reduce.add.1 = add i32 %reduce.add.0, 0
%reduce.add.2 = add i32 %reduce.add.1, 0
%reduce.add.2.narrow = trunc i32 %reduce.add.2 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.2.narrow, i8* %out
; Step of 4 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 4
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}

View File

@ -0,0 +1,73 @@
; Loop-vectorizer cost-model test for a strided i32 load. The loop in @test
; advances its index by 4 and loads a single i32 per iteration.
; Each opt invocation below enables one x86 feature level, and the debug output
; of the loop vectorizer is matched for the cost reported per VF for %v0.
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @A is the i32 array the loop reads; @B is the i8 array it writes.
@A = global [1024 x i32] zeroinitializer, align 128
@B = global [1024 x i8] zeroinitializer, align 128
; CHECK: LV: Checking a loop in "test"
;
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 220 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX2: LV: Found an estimated cost of 440 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
;
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, i32* %in0, align 4
; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, i32* %in0, align 4
;
; Closing negative check - no further cost line for the %v0 load may follow.
; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %v0 = load i32, i32* %in0, align 4
; Kernel under test: every iteration loads the i32 at @A[%iv], truncates it to
; i8 and stores it to @B[%iv]. The index advances by 4 per iteration, so only
; one of every four elements of @A is read.
define void @test() {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%iv.0 = add nuw nsw i64 %iv, 0
%in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
; The only load in the loop; its per-VF cost is what the test directives match.
%v0 = load i32, i32* %in0
; The three additions of zero keep the value unchanged - presumably stand-ins
; for the three stride members that are not loaded (generated template; confirm).
%reduce.add.0 = add i32 %v0, 0
%reduce.add.1 = add i32 %reduce.add.0, 0
%reduce.add.2 = add i32 %reduce.add.1, 0
%reduce.add.2.narrow = trunc i32 %reduce.add.2 to i8
%out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
store i8 %reduce.add.2.narrow, i8* %out
; Step of 4 elements per iteration.
%iv.next = add nuw nsw i64 %iv.0, 4
; Keep looping while the next index is below 1024 (unsigned compare).
%cmp = icmp ult i64 %iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
ret void
}