forked from OSchip/llvm-project
parent
1fbaffeba1
commit
de56903bde
|
@ -123,7 +123,7 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
|
|||
"are incurred."));
|
||||
|
||||
static cl::opt<bool> MaximizeBandwidth(
|
||||
"vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden,
|
||||
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
||||
cl::desc("Maximize bandwidth when selecting vectorization factor which "
|
||||
"will be determined by the smallest type in loop."));
|
||||
|
||||
|
|
|
@ -88,9 +88,9 @@ for.body: ; preds = %entry, %for.body
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @add_c(
|
||||
; CHECK: load <16 x i8>, <16 x i8>*
|
||||
; CHECK: add <16 x i16>
|
||||
; CHECK: store <16 x i16>
|
||||
; CHECK: load <8 x i8>, <8 x i8>*
|
||||
; CHECK: add <8 x i16>
|
||||
; CHECK: store <8 x i16>
|
||||
; Function Attrs: nounwind
|
||||
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
|
||||
entry:
|
||||
|
@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @add_d(
|
||||
; CHECK: load <8 x i16>
|
||||
; CHECK: add nsw <8 x i32>
|
||||
; CHECK: store <8 x i32>
|
||||
; CHECK: load <4 x i16>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: store <4 x i32>
|
||||
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
|
||||
entry:
|
||||
%cmp7 = icmp sgt i32 %len, 0
|
||||
|
@ -187,16 +187,16 @@ for.body: ; preds = %for.body, %for.body
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @add_f
|
||||
; CHECK: load <16 x i16>
|
||||
; CHECK: trunc <16 x i16>
|
||||
; CHECK: shl <16 x i8>
|
||||
; CHECK: add <16 x i8>
|
||||
; CHECK: or <16 x i8>
|
||||
; CHECK: mul <16 x i8>
|
||||
; CHECK: and <16 x i8>
|
||||
; CHECK: xor <16 x i8>
|
||||
; CHECK: mul <16 x i8>
|
||||
; CHECK: store <16 x i8>
|
||||
; CHECK: load <8 x i16>
|
||||
; CHECK: trunc <8 x i16>
|
||||
; CHECK: shl <8 x i8>
|
||||
; CHECK: add <8 x i8>
|
||||
; CHECK: or <8 x i8>
|
||||
; CHECK: mul <8 x i8>
|
||||
; CHECK: and <8 x i8>
|
||||
; CHECK: xor <8 x i8>
|
||||
; CHECK: mul <8 x i8>
|
||||
; CHECK: store <8 x i8>
|
||||
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
|
||||
entry:
|
||||
%cmp.32 = icmp sgt i32 %len, 0
|
||||
|
|
|
@ -123,16 +123,16 @@ for.body:
|
|||
; }
|
||||
;
|
||||
; CHECK: vector.body:
|
||||
; CHECK: phi <16 x i16>
|
||||
; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
|
||||
; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
|
||||
; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
|
||||
; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
|
||||
; CHECK: add <16 x i16>
|
||||
; CHECK: add <16 x i16>
|
||||
; CHECK: phi <8 x i16>
|
||||
; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
|
||||
; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
|
||||
; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
|
||||
; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
|
||||
; CHECK: add <8 x i16>
|
||||
; CHECK: add <8 x i16>
|
||||
;
|
||||
; CHECK: middle.block:
|
||||
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
|
||||
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
|
||||
; CHECK: zext i16 [[Rdx]] to i32
|
||||
;
|
||||
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
|
||||
|
|
|
@ -35,9 +35,9 @@ define void @example1() nounwind uwtable ssp {
|
|||
}
|
||||
|
||||
;CHECK-LABEL: @example10b(
|
||||
;CHECK: load <8 x i16>
|
||||
;CHECK: sext <8 x i16>
|
||||
;CHECK: store <8 x i32>
|
||||
;CHECK: load <4 x i16>
|
||||
;CHECK: sext <4 x i16>
|
||||
;CHECK: store <4 x i32>
|
||||
;CHECK: ret void
|
||||
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
|
||||
br label %1
|
||||
|
|
|
@ -9,9 +9,7 @@ target triple = "x86_64-apple-macosx"
|
|||
|
||||
; If we need to scalarize the fptoui and then use inserts to build up the
|
||||
; vector again, then there is certainly no value in going 256-bit wide.
|
||||
; But as we default to maximize bandwidth, we should convert it to 256-bit
|
||||
; anyway.
|
||||
; CHECK: vpinsrd
|
||||
; CHECK-NOT: vpinsrd
|
||||
|
||||
define void @convert() {
|
||||
entry:
|
||||
|
|
|
@ -44,16 +44,17 @@ define void @example1() nounwind uwtable ssp {
|
|||
ret void
|
||||
}
|
||||
|
||||
; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
|
||||
;CHECK-LABEL: @example10b(
|
||||
;CHECK: load <8 x i16>
|
||||
;CHECK: sext <8 x i16>
|
||||
;CHECK: store <8 x i32>
|
||||
;CHECK: load <4 x i16>
|
||||
;CHECK: sext <4 x i16>
|
||||
;CHECK: store <4 x i32>
|
||||
;CHECK: ret void
|
||||
;UNROLL-LABEL: @example10b(
|
||||
;UNROLL: load <8 x i16>
|
||||
;UNROLL: load <8 x i16>
|
||||
;UNROLL: store <8 x i32>
|
||||
;UNROLL: store <8 x i32>
|
||||
;UNROLL: load <4 x i16>
|
||||
;UNROLL: load <4 x i16>
|
||||
;UNROLL: store <4 x i32>
|
||||
;UNROLL: store <4 x i32>
|
||||
;UNROLL: ret void
|
||||
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
|
||||
br label %1
|
||||
|
|
|
@ -260,28 +260,20 @@ for.end: ; preds = %for.cond
|
|||
; }
|
||||
;}
|
||||
|
||||
;AVX1-LABEL: @foo3
|
||||
;AVX1: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
|
||||
;AVX1: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
|
||||
;AVX1: sitofp <4 x i32> %wide.load to <4 x double>
|
||||
;AVX1: fadd <4 x double>
|
||||
;AVX1: call void @llvm.masked.store.v4f64.p0v4f64
|
||||
;AVX1: ret void
|
||||
|
||||
;AVX2-LABEL: @foo3
|
||||
;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
|
||||
;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
|
||||
;AVX2: sitofp <8 x i32> %wide.load to <8 x double>
|
||||
;AVX2: fadd <8 x double>
|
||||
;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX2: ret void
|
||||
;AVX-LABEL: @foo3
|
||||
;AVX: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
|
||||
;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
|
||||
;AVX: sitofp <4 x i32> %wide.load to <4 x double>
|
||||
;AVX: fadd <4 x double>
|
||||
;AVX: call void @llvm.masked.store.v4f64.p0v4f64
|
||||
;AVX: ret void
|
||||
|
||||
;AVX512-LABEL: @foo3
|
||||
;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100,
|
||||
;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
|
||||
;AVX512: sitofp <16 x i32> %wide.load to <16 x double>
|
||||
;AVX512: fadd <16 x double>
|
||||
;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
|
||||
;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
|
||||
;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
|
||||
;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
|
||||
;AVX512: fadd <8 x double>
|
||||
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX512: ret void
|
||||
|
||||
|
||||
|
@ -510,19 +502,19 @@ for.end: ; preds = %for.cond
|
|||
; }
|
||||
;}
|
||||
;AVX2-LABEL: @foo6
|
||||
;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer
|
||||
;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
|
||||
;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
|
||||
;AVX2: fadd <8 x double>
|
||||
;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
|
||||
;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
||||
;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
|
||||
;AVX2: fadd <4 x double>
|
||||
;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
|
||||
;AVX2: ret void
|
||||
|
||||
;AVX512-LABEL: @foo6
|
||||
;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer
|
||||
;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32> <i32 15, i32 14, i32 13, i32 12
|
||||
;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
|
||||
;AVX512: fadd <16 x double>
|
||||
;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
|
||||
;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
|
||||
;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
|
||||
;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
|
||||
;AVX512: fadd <8 x double>
|
||||
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX512: ret void
|
||||
|
||||
|
||||
|
@ -590,8 +582,8 @@ for.end: ; preds = %for.cond
|
|||
; }
|
||||
|
||||
;AVX512-LABEL: @foo7
|
||||
;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>*
|
||||
;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
|
||||
;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
|
||||
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX512: ret void
|
||||
|
||||
define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
|
||||
|
@ -662,8 +654,8 @@ for.end: ; preds = %for.cond
|
|||
;}
|
||||
|
||||
;AVX512-LABEL: @foo8
|
||||
;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* %
|
||||
;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
|
||||
;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
|
||||
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
|
||||
;AVX512: ret void
|
||||
|
||||
define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
|
||||
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
|
||||
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2)
|
||||
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.10.0"
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
|
||||
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
|
||||
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300)
|
||||
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.10.0"
|
||||
|
|
|
@ -7,7 +7,7 @@ target triple = "i386-apple-darwin"
|
|||
define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
|
||||
; CHECK-LABEL: @test1(
|
||||
; CHECK: preheader
|
||||
; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0
|
||||
; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
|
||||
; CHECK: vector.memcheck
|
||||
|
||||
bb:
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
; DEBUG-OUTPUT-NOT: .loc
|
||||
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
|
||||
|
||||
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
|
||||
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
|
||||
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
|
||||
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
; DEBUG-OUTPUT-NOT: .loc
|
||||
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
|
||||
|
||||
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
|
||||
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
|
||||
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
|
||||
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
|
||||
|
||||
|
|
Loading…
Reference in New Issue