2014-09-11 01:58:16 +08:00
|
|
|
; Run the loop vectorizer with a forced vector width of 4 and an interleave
; count of 1, then dce and instcombine, and match the textual IR with FileCheck.
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

; The module is pinned to the x86-64 macOS target named in the triple below.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_sum(
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: phi <4 x i32>
|
|
|
|
;CHECK: load <4 x i32>
|
|
|
|
;CHECK: add <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for an integer add reduction. Roughly:
;   sum = 0; for (i = 0; i < n; ++i) sum += i + A[i] + B[i]; return sum;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Guard: only enter the loop for a positive trip count.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Reduction accumulator, seeded with 0 (the identity for add).
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  %3 = load i32* %2, align 4
  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %5 = load i32* %4, align 4
  ; The induction variable itself is one of the summed terms.
  %6 = trunc i64 %indvars.iv to i32
  ; Three chained adds form the reduction: sum + i + A[i] + B[i].
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ; 0 when the loop was skipped, otherwise the final accumulated sum.
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_prod(
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: phi <4 x i32>
|
|
|
|
;CHECK: load <4 x i32>
|
|
|
|
;CHECK: mul <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: mul <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: mul <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for an integer multiply reduction. Roughly:
;   prod = 1; for (i = 0; i < n; ++i) prod *= i * A[i] * B[i]; return prod;
; The loop is bypassed when %n <= 0, in which case 1 is returned.
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Guard: only enter the loop for a positive trip count.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Reduction accumulator, seeded with 1 (the identity for mul).
  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  %3 = load i32* %2, align 4
  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %5 = load i32* %4, align 4
  ; The induction variable itself is one of the factors.
  %6 = trunc i64 %indvars.iv to i32
  ; Three chained muls form the reduction: prod * i * A[i] * B[i].
  %7 = mul i32 %prod.02, %6
  %8 = mul i32 %7, %3
  %9 = mul i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ; 1 when the loop was skipped, otherwise the final accumulated product.
  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_mix(
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: phi <4 x i32>
|
|
|
|
;CHECK: load <4 x i32>
|
2012-11-01 05:40:39 +08:00
|
|
|
;CHECK: mul nsw <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for an add reduction whose per-iteration term contains a
; multiply. Roughly:
;   sum = 0; for (i = 0; i < n; ++i) sum += i + A[i] * B[i]; return sum;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Guard: only enter the loop for a positive trip count.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Reduction accumulator, seeded with 0.
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  %3 = load i32* %2, align 4
  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %5 = load i32* %4, align 4
  ; The multiply is not part of the reduction chain; it only feeds the add.
  %6 = mul nsw i32 %5, %3
  %7 = trunc i64 %indvars.iv to i32
  ; Reduction chain: sum + i + (A[i] * B[i]).
  %8 = add i32 %sum.02, %7
  %9 = add i32 %8, %6
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ; 0 when the loop was skipped, otherwise the final accumulated sum.
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_mul(
|
2012-10-21 13:52:51 +08:00
|
|
|
;CHECK: mul <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: mul <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: mul <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-20 07:05:40 +08:00
|
|
|
;CHECK: ret i32
|
2012-10-21 13:52:51 +08:00
|
|
|
; Scalar input for a multiply reduction with a non-identity start value.
; Roughly:
;   acc = 19; for (i = 0; i < n; ++i) acc *= (A[i] + i + B[i]); return acc;
; The loop is bypassed when %n <= 0.
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Guard: only enter the loop for a positive trip count.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Reduction accumulator (a product, despite the "sum" name), seeded with 19.
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  %3 = load i32* %2, align 4
  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %5 = load i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  ; The adds build the per-iteration factor; only the mul is the reduction.
  %7 = add i32 %3, %6
  %8 = add i32 %7, %5
  %9 = mul i32 %8, %sum.02
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ; NOTE(review): the zero-trip path yields 0 although the accumulator is
  ; seeded with 19. This looks unintentional; confirm before changing it.
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
|
2012-10-21 13:52:51 +08:00
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @start_at_non_zero(
|
2012-10-21 13:52:51 +08:00
|
|
|
;CHECK: phi <4 x i32>
|
|
|
|
;CHECK: <i32 120, i32 0, i32 0, i32 0>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: add <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-21 13:52:51 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for an add reduction that starts at a non-zero value. Roughly:
;   sum = 120; for (i = 0; i < n; ++i) sum += in[i] * coeff[i]; return sum;
; The loop is bypassed when %n <= 0, in which case 120 is returned.
; %out is not referenced by the body.
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Reduction accumulator, seeded with 120 rather than the add identity.
  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
  %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
  %1 = load i32* %arrayidx2, align 4
  ; Per-iteration term: in[i] * coeff[i].
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %sum.09
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; 120 when the loop was skipped, otherwise the final accumulated sum.
  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_and(
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: and <4 x i32>
|
2012-10-31 02:12:36 +08:00
|
|
|
;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: and <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: and <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for a bitwise-and reduction. Roughly:
;   r = -1; for (i = 0; i < n; ++i) r &= A[i] + B[i]; return r;
; The loop is bypassed when %n <= 0, in which case -1 is returned.
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Reduction accumulator, seeded with -1 (all bits set, the identity for and).
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %1 = load i32* %arrayidx2, align 4
  ; Per-iteration term: A[i] + B[i].
  %add = add nsw i32 %1, %0
  %and = and i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; -1 when the loop was skipped, otherwise the final accumulated value.
  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
  ret i32 %result.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_or(
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: or <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: or <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: or <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for a bitwise-or reduction. Roughly:
;   r = 0; for (i = 0; i < n; ++i) r |= A[i] + B[i]; return r;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Reduction accumulator, seeded with 0 (the identity for or).
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %1 = load i32* %arrayidx2, align 4
  ; Per-iteration term: A[i] + B[i].
  %add = add nsw i32 %1, %0
  %or = or i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; 0 when the loop was skipped, otherwise the final accumulated value.
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
  ret i32 %result.0.lcssa
}
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_xor(
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: xor <4 x i32>
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
|
|
;CHECK: xor <4 x i32>
|
|
|
|
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
|
|
;CHECK: xor <4 x i32>
|
|
|
|
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
|
2012-10-25 08:08:41 +08:00
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input for a bitwise-xor reduction. Roughly:
;   r = 0; for (i = 0; i < n; ++i) r ^= A[i] + B[i]; return r;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Reduction accumulator, seeded with 0 (the identity for xor).
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
  %1 = load i32* %arrayidx2, align 4
  ; Per-iteration term: A[i] + B[i].
  %add = add nsw i32 %1, %0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; 0 when the loop was skipped, otherwise the final accumulated value.
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}
|
2013-01-05 06:10:16 +08:00
|
|
|
|
2013-01-05 09:15:47 +08:00
|
|
|
; In this test the reduction variable is on the RHS of the subtraction (x = A[i] - x), so the loop is not a reduction we can vectorize.
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_sub_rhs(
|
2013-01-05 06:10:16 +08:00
|
|
|
;CHECK-NOT: phi <4 x i32>
|
|
|
|
;CHECK-NOT: sub nsw <4 x i32>
|
|
|
|
;CHECK: ret i32
|
2013-01-05 09:15:47 +08:00
|
|
|
; Scalar input where the carried value is the right-hand operand of the
; subtraction. Roughly:
;   x = 0; for (i = 0; i < n; ++i) x = A[i] - x; return x;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Loop-carried value, seeded with 0.
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  ; The carried value is the subtrahend here: A[i] - x.
  %sub = sub nsw i32 %0, %x.05
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; 0 when the loop was skipped, otherwise the final carried value.
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}
|
2013-01-05 09:15:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
; In this test the reduction variable is on the LHS and we can vectorize it.
|
2013-07-14 09:42:54 +08:00
|
|
|
;CHECK-LABEL: @reduction_sub_lhs(
|
2013-01-05 09:15:47 +08:00
|
|
|
;CHECK: phi <4 x i32>
|
|
|
|
;CHECK: sub nsw <4 x i32>
|
|
|
|
;CHECK: ret i32
|
|
|
|
; Scalar input where the carried value is the left-hand operand of the
; subtraction. Roughly:
;   x = 0; for (i = 0; i < n; ++i) x = x - A[i]; return x;
; The loop is bypassed when %n <= 0, in which case 0 is returned.
define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Reduction accumulator, seeded with 0.
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  ; The accumulator is the minuend here: x - A[i].
  %sub = sub nsw i32 %x.05, %0
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ; 0 when the loop was skipped, otherwise the final accumulated value.
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}
|
2013-05-08 05:55:37 +08:00
|
|
|
|
|
|
|
; We can vectorize conditional reductions with multi-input phis.
; CHECK-LABEL: @reduction_conditional(
; CHECK: fadd <4 x float>

; Scalar input: a 128-iteration loop that conditionally adds to a float
; accumulator seeded with %S. Roughly:
;   if (A[i] > B[i]) {
;     if (B[i] > 1.0)      sum += A[i];
;     else if (A[i] > 2.0) sum += B[i];
;   }
; Every path that does not add passes the accumulator through unchanged.
; %C is not referenced by the body.
define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Reduction accumulator, seeded with the %S argument.
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
  %0 = load float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float* %B, i64 %indvars.iv
  %1 = load float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; Merge of the four paths; every incoming value is either an fadd of the
  ; accumulator or the accumulator itself.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping until the truncated next index reaches 128.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}
|
|
|
|
|
|
|
|
; We can't vectorize reductions with phi inputs from outside the reduction.
; CHECK-LABEL: @noreduction_phi(
; CHECK-NOT: fadd <4 x float>

; Scalar input: same control flow as @reduction_conditional, except that the
; merge phi in %for.inc receives the constant 0.0 from %if.else instead of the
; accumulator, so one path overwrites the running value.
; %C is not referenced by the body.
define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Loop-carried accumulator, seeded with the %S argument.
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
  %0 = load float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float* %B, i64 %indvars.iv
  %1 = load float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; The 0.0 incoming value from %if.else does not derive from the accumulator.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping until the truncated next index reaches 128.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}
|
|
|
|
|
|
|
|
; We can't vectorize reductions that feed another header PHI.
; CHECK-LABEL: @noredux_header_phi(
; CHECK-NOT: fadd <4 x float>

; Scalar input: a 128-iteration loop with two accumulators, where the updated
; first accumulator is an operand of the second one's update. Roughly:
;   sum = S; sum2 = 0.0;
;   for (i = 0; i < 128; ++i) { sum += B[i]; sum2 += sum; }
;   return sum + sum2;
; %A and %C are not referenced by the body.
define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Second accumulator, seeded with 0.0.
  %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
  ; First accumulator, seeded with the %S argument.
  %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
  %0 = load float* %arrayidx, align 4
  %add = fadd fast float %sum.08, %0
  ; %add is used here as well as by its own header phi.
  %add1 = fadd fast float %sum2.09, %add
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping until the truncated next index reaches 128.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %add1.lcssa = phi float [ %add1, %for.body ]
  %add.lcssa = phi float [ %add, %for.body ]
  %add2 = fadd fast float %add.lcssa, %add1.lcssa
  ret float %add2
}
|
LoopVectorizer: Disallow reductions whose header phi is used outside the loop
If an outside loop user of the reduction value uses the header phi node we
cannot just reduce the vectorized phi value in the vector code epilog because
we would lose VF-1 reductions.
lp:
p = phi (0, lv)
lv = lv + 1
...
brcond , lp, outside
outside:
usr = add 0, p
(Say the loop iterates two times, the value of p coming out of the loop is one).
We cannot just transform this to:
vlp:
p = phi (<0,0>, lv)
lv = lv + <1,1>
..
brcond , lp, outside
outside:
p_reduced = p[0] + p[1];
usr = add 0, p_reduced
(Because the original loop iterated two times the vectorized loop would iterate
one time, but p_reduced ends up being zero instead of one).
We would have to execute VF-1 iterations in the scalar remainder loop in such
cases. For now, just disable vectorization.
PR16522
llvm-svn: 186256
2013-07-14 03:09:29 +08:00
|
|
|
|
|
|
|
|
|
|
|
; When vectorizing a reduction whose loop header phi value is used outside the
|
|
|
|
; loop special care must be taken. Otherwise, the reduced value feeding into the
|
|
|
|
; outside user misses a few iterations (VF-1) of the loop.
|
|
|
|
; PR16522
|
|
|
|
|
2013-07-14 09:42:54 +08:00
|
|
|
; CHECK-LABEL: @phivalueredux(
|
LoopVectorizer: Disallow reductions whose header phi is used outside the loop
If an outside loop user of the reduction value uses the header phi node we
cannot just reduce the vectorized phi value in the vector code epilog because
we would lose VF-1 reductions.
lp:
p = phi (0, lv)
lv = lv + 1
...
brcond , lp, outside
outside:
usr = add 0, p
(Say the loop iterates two times, the value of p coming out of the loop is one).
We cannot just transform this to:
vlp:
p = phi (<0,0>, lv)
lv = lv + <1,1>
..
brcond , lp, outside
outside:
p_reduced = p[0] + p[1];
usr = add 0, p_reduced
(Because the original loop iterated two times the vectorized loop would iterate
one time, but p_reduced ends up being zero instead of one).
We would have to execute VF-1 iterations in the scalar remainder loop in such
cases. For now, just disable vectorization.
PR16522
llvm-svn: 186256
2013-07-14 03:09:29 +08:00
|
|
|
; CHECK-NOT: x i32>
|
|
|
|
|
|
|
|
; Scalar input: a 16-iteration loop that inverts all bits of a carried value
; (xor with -1) each iteration. The function returns the header phi
; %p.addr.02, not the updated %xor, so the returned value is the one from
; before the final update.
define i32 @phivalueredux(i32 %p) {
entry:
  br label %for.body

for.body:
  ; Iteration counter, 0 .. 15.
  %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; Carried value, seeded with the %p argument.
  %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
  %xor = xor i32 %p.addr.02, -1
  %inc = add nsw i32 %t.03, 1
  %exitcond = icmp eq i32 %inc, 16
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ; Out-of-loop use of the header phi itself.
  ret i32 %p.addr.02
}
|
2013-10-08 05:05:43 +08:00
|
|
|
|
|
|
|
; Don't vectorize a reduction value that is not the last in a reduction cycle. We
; would lose iterations (VF-1) on the operations after that use.
; PR17498

; CHECK-LABEL: @not_last_operation(
; CHECK-NOT: x i32>

; Scalar input: a 22-iteration loop whose accumulator is updated by two
; chained adds per iteration. The code after the loop uses the result of the
; first add (%inc511.1.inc4.1), not the value that is carried around the loop
; (%inc5.1).
define i32 @not_last_operation(i32 %p, i32 %val) {
entry:
  %tobool = icmp eq i32 %p, 0
  br label %for.body

for.body:
  ; Iteration counter, 0 .. 21.
  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
  ; Accumulator, seeded with the %val argument.
  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
  ; %inc4.1 is 1 when %p != 0 and 0 otherwise.
  %0 = zext i1 %tobool to i32
  %inc4.1 = xor i32 %0, 1
  ; First add of the chain; this intermediate value escapes the loop.
  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
  ; Second add of the chain; this is the value fed back to the header phi.
  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
  %inc6.1 = add nsw i32 %inc613.1, 1
  %exitcond.1 = icmp eq i32 %inc6.1, 22
  br i1 %exitcond.1, label %exit, label %for.body

exit:
  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
  ret i32 %inc.2
}
|