Add a basic-block autovectorization pass.

This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure.
Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser).

llvm-svn: 149468
This commit is contained in:
Hal Finkel 2012-02-01 03:51:43 +00:00
parent 47b349328b
commit c34e51132c
35 changed files with 2635 additions and 12 deletions

View File

@ -126,6 +126,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<tr><td><a href="#adce">-adce</a></td><td>Aggressive Dead Code Elimination</td></tr>
<tr><td><a href="#always-inline">-always-inline</a></td><td>Inliner for always_inline functions</td></tr>
<tr><td><a href="#argpromotion">-argpromotion</a></td><td>Promote 'by reference' arguments to scalars</td></tr>
<tr><td><a href="#bb-vectorize">-bb-vectorize</a></td><td>Combine instructions to form vector instructions within basic blocks</td></tr>
<tr><td><a href="#block-placement">-block-placement</a></td><td>Profile Guided Basic Block Placement</td></tr>
<tr><td><a href="#break-crit-edges">-break-crit-edges</a></td><td>Break critical edges in CFG</td></tr>
<tr><td><a href="#codegenprepare">-codegenprepare</a></td><td>Optimize for code generation</td></tr>
@ -815,6 +816,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
</p>
</div>
<!-------------------------------------------------------------------------- -->
<h3>
<a name="bb-vectorize">-bb-vectorize: Basic-Block Vectorization</a>
</h3>
<div>
<p>This pass combines instructions inside basic blocks to form vector
instructions. It iterates over each basic block, attempting to pair
compatible instructions, repeating this process until no additional
pairs are selected for vectorization. When the outputs of some pair
of compatible instructions are used as inputs by some other pair of
compatible instructions, those pairs are part of a potential
vectorization chain. Instruction pairs are only fused into vector
instructions when they are part of a chain longer than some
threshold length. Moreover, the pass attempts to find the best
possible chain for each pair of compatible instructions. These
heuristics are intended to prevent vectorization in cases where
it would not yield a performance increase of the resulting code.
</p>
</div>
<!-------------------------------------------------------------------------- -->
<h3>
<a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a>

View File

@ -25,6 +25,7 @@ extern "C" {
void LLVMInitializeCore(LLVMPassRegistryRef R);
void LLVMInitializeTransformUtils(LLVMPassRegistryRef R);
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
void LLVMInitializeVectorization(LLVMPassRegistryRef R);
void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
void LLVMInitializeIPO(LLVMPassRegistryRef R);
void LLVMInitializeInstrumentation(LLVMPassRegistryRef R);

View File

@ -0,0 +1,37 @@
/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\
|*===----------- Vectorization Transformation Library C Interface ---------===*|
|* *|
|* The LLVM Compiler Infrastructure *|
|* *|
|* This file is distributed under the University of Illinois Open Source *|
|* License. See LICENSE.TXT for details. *|
|* *|
|*===----------------------------------------------------------------------===*|
|* *|
|* This header declares the C interface to libLLVMVectorize.a, which *|
|* implements various vectorization transformations of the LLVM IR. *|
|* *|
|* Many exotic languages can interoperate with C code but have a harder time *|
|* with C++ due to name mangling. So in addition to C, this interface enables *|
|* tools written in such languages. *|
|* *|
\*===----------------------------------------------------------------------===*/
#ifndef LLVM_C_TRANSFORMS_VECTORIZE_H
#define LLVM_C_TRANSFORMS_VECTORIZE_H
#include "llvm-c/Core.h"
#ifdef __cplusplus
extern "C" {
#endif
/** See llvm::createBBVectorizePass function. */
void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
#ifdef __cplusplus
}
#endif /* defined(__cplusplus) */
#endif

View File

@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&);
/// ScalarOpts library.
void initializeScalarOpts(PassRegistry&);
/// initializeVectorization - Initialize all passes linked into the
/// Vectorize library.
void initializeVectorization(PassRegistry&);
/// initializeInstCombine - Initialize all passes linked into the
/// ScalarOpts library.
void initializeInstCombine(PassRegistry&);
@ -236,7 +240,7 @@ void initializeVirtRegMapPass(PassRegistry&);
void initializeInstSimplifierPass(PassRegistry&);
void initializeUnpackMachineBundlesPass(PassRegistry&);
void initializeFinalizeMachineBundlesPass(PassRegistry&);
void initializeBBVectorizePass(PassRegistry&);
}
#endif

View File

@ -31,6 +31,7 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Vectorize.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include <cstdlib>
@ -151,6 +152,7 @@ namespace {
(void) llvm::createCorrelatedValuePropagationPass();
(void) llvm::createMemDepPrinter();
(void) llvm::createInstructionSimplifierPass();
(void) llvm::createBBVectorizePass();
(void)new llvm::IntervalPartition();
(void)new llvm::FindUsedTypes();

View File

@ -99,6 +99,7 @@ public:
bool DisableSimplifyLibCalls;
bool DisableUnitAtATime;
bool DisableUnrollLoops;
bool Vectorize;
private:
/// ExtensionList - This is list of all of the extensions that are registered.

View File

@ -0,0 +1,30 @@
//===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for accessor functions that expose passes
// in the Vectorize transformations library.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_H
#define LLVM_TRANSFORMS_VECTORIZE_H
namespace llvm {
class BasicBlockPass;
//===----------------------------------------------------------------------===//
//
// BBVectorize - A basic-block vectorization pass.
//
BasicBlockPass *createBBVectorizePass();
} // End llvm namespace
#endif

View File

@ -3,4 +3,5 @@ add_subdirectory(Instrumentation)
add_subdirectory(InstCombine)
add_subdirectory(Scalar)
add_subdirectory(IPO)
add_subdirectory(Vectorize)
add_subdirectory(Hello)

View File

@ -20,4 +20,4 @@ type = Library
name = IPO
parent = Transforms
library_name = ipo
required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils
required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils

View File

@ -21,14 +21,20 @@
#include "llvm/DefaultPasses.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Vectorize.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
static cl::opt<bool>
RunVectorization("vectorize", cl::desc("Run vectorization passes"));
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@ -37,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() {
DisableSimplifyLibCalls = false;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
Vectorize = RunVectorization;
}
PassManagerBuilder::~PassManagerBuilder() {
@ -172,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
if (Vectorize) {
MPM.add(createBBVectorizePass());
MPM.add(createInstructionCombiningPass());
if (OptLevel > 1)
MPM.add(createGVNPass()); // Remove redundancies
}
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createInstructionCombiningPass()); // Clean up after everything.

View File

@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
subdirectories = IPO InstCombine Instrumentation Scalar Utils
subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize
[component_0]
type = Group

View File

@ -8,7 +8,7 @@
##===----------------------------------------------------------------------===##
LEVEL = ../..
PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello
PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello
include $(LEVEL)/Makefile.config

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
add_llvm_library(LLVMVectorize
BBVectorize.cpp
Vectorize.cpp
)

View File

@ -0,0 +1,24 @@
;===- ./lib/Transforms/Scalar/LLVMBuild.txt --------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
; http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;
[component_0]
type = Library
name = Vectorize
parent = Transforms
library_name = Vectorize
required_libraries = Analysis Core InstCombine Support Target TransformUtils

View File

@ -0,0 +1,15 @@
##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##
LEVEL = ../../..
LIBRARYNAME = LLVMVectorize
BUILD_ARCHIVE = 1
include $(LEVEL)/Makefile.common

View File

@ -0,0 +1,39 @@
//===-- Vectorize.cpp -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
// implements several vectorization transformations over the LLVM intermediate
// representation, including the C bindings for that library.
//
//===----------------------------------------------------------------------===//
#include "llvm-c/Transforms/Vectorize.h"
#include "llvm-c/Initialization.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
/// initializeVectorizationPasses - Initialize all passes linked into the
/// Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
initializeBBVectorizePass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBBVectorizePass());
}

View File

@ -0,0 +1,112 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise
; want to select the pairs:
; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
; and so a dependency cycle would be created.
declare double @fabs(double) nounwind readnone
define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
entry:
br label %go
go:
%conv = sitofp i32 %n.0 to double
%add35 = fadd double %conv, %a
%sub36 = fadd double %add35, -1.000000e+00
%add38 = fadd double %conv, %b
%sub39 = fadd double %add38, -1.000000e+00
%add41 = fadd double %conv, %c
%sub42 = fadd double %add41, -1.000000e+00
%sub45 = fadd double %add35, -2.000000e+00
%sub48 = fadd double %add38, -2.000000e+00
%sub51 = fadd double %add41, -2.000000e+00
%mul52 = shl nsw i32 %n.0, 1
%sub53 = add nsw i32 %mul52, -1
%conv54 = sitofp i32 %sub53 to double
%sub56 = add nsw i32 %mul52, -3
%conv57 = sitofp i32 %sub56 to double
%sub59 = add nsw i32 %mul52, -5
%conv60 = sitofp i32 %sub59 to double
%mul61 = mul nsw i32 %n.0, %n.0
%conv62 = sitofp i32 %mul61 to double
%mul63 = fmul double %conv62, 3.000000e+00
%mul67 = fmul double %sub65, %conv
%add68 = fadd double %mul63, %mul67
%add69 = fadd double %add68, 2.000000e+00
%sub71 = fsub double %add69, %mul2.v.r1
%sub74 = fsub double %sub71, %mul73
%mul75 = fmul double %conv57, 2.000000e+00
%mul76 = fmul double %mul75, %sub42
%div77 = fdiv double %sub74, %mul76
%mul82 = fmul double %add80, %conv
%sub83 = fsub double %mul63, %mul82
%add84 = fadd double %sub83, 2.000000e+00
%sub86 = fsub double %add84, %mul2.v.r1
%sub87 = fsub double -0.000000e+00, %sub86
%mul88 = fmul double %sub36, %sub87
%mul89 = fmul double %mul88, %sub39
%mul90 = fmul double %conv54, 4.000000e+00
%mul91 = fmul double %mul90, %conv57
%mul92 = fmul double %mul91, %sub51
%mul93 = fmul double %mul92, %sub42
%div94 = fdiv double %mul89, %mul93
%mul95 = fmul double %sub45, %sub36
%mul96 = fmul double %mul95, %sub48
%mul97 = fmul double %mul96, %sub39
%sub99 = fsub double %conv, %a
%sub100 = fadd double %sub99, -2.000000e+00
%mul101 = fmul double %mul97, %sub100
%sub103 = fsub double %conv, %b
%sub104 = fadd double %sub103, -2.000000e+00
%mul105 = fmul double %mul101, %sub104
%mul106 = fmul double %conv57, 8.000000e+00
%mul107 = fmul double %mul106, %conv57
%mul108 = fmul double %mul107, %conv60
%sub111 = fadd double %add41, -3.000000e+00
%mul112 = fmul double %mul108, %sub111
%mul113 = fmul double %mul112, %sub51
%mul114 = fmul double %mul113, %sub42
%div115 = fdiv double %mul105, %mul114
%sub116 = fsub double -0.000000e+00, %sub36
%mul117 = fmul double %sub39, %sub116
%sub119 = fsub double %conv, %c
%sub120 = fadd double %sub119, -1.000000e+00
%mul121 = fmul double %mul117, %sub120
%mul123 = fmul double %mul75, %sub51
%mul124 = fmul double %mul123, %sub42
%div125 = fdiv double %mul121, %mul124
%mul126 = fmul double %div77, %sub
%add127 = fadd double %mul126, 1.000000e+00
%mul128 = fmul double %add127, %Anm1.0
%mul129 = fmul double %div94, %sub
%add130 = fadd double %div125, %mul129
%mul131 = fmul double %add130, %sub
%mul132 = fmul double %mul131, %Anm2.0
%add133 = fadd double %mul128, %mul132
%mul134 = fmul double %div115, %mul1
%mul135 = fmul double %mul134, %Anm3.0
%add136 = fadd double %add133, %mul135
%mul139 = fmul double %add127, %Bnm1.0
%mul143 = fmul double %mul131, %Bnm2.0
%add144 = fadd double %mul139, %mul143
%mul146 = fmul double %mul134, %Bnm3.0
%add147 = fadd double %add144, %mul146
%div148 = fdiv double %add136, %add147
%sub149 = fsub double %F.0, %div148
%div150 = fdiv double %sub149, %F.0
%call = tail call double @fabs(double %div150) nounwind readnone
%cmp = fcmp olt double %call, 0x3CB0000000000000
%cmp152 = icmp sgt i32 %n.0, 20000
%or.cond = or i1 %cmp, %cmp152
br i1 %or.cond, label %done, label %go
done:
ret void
; CHECK: @test1
; CHECK: go:
; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
; FIXME: When tree pruning is deterministic, include the entire output.
}

View File

@ -0,0 +1,3 @@
load_lib llvm.exp
RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]

View File

@ -0,0 +1,41 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
entry:
%i0 = load double* %a, align 8
%i1 = load double* %b, align 8
%mul = fmul double %i0, %i1
%i2 = load double* %c, align 8
%add = fadd double %mul, %i2
%arrayidx3 = getelementptr inbounds double* %a, i64 1
%i3 = load double* %arrayidx3, align 8
%arrayidx4 = getelementptr inbounds double* %b, i64 1
%i4 = load double* %arrayidx4, align 8
%mul5 = fmul double %i3, %i4
%arrayidx6 = getelementptr inbounds double* %c, i64 1
%i5 = load double* %arrayidx6, align 8
%add7 = fadd double %mul5, %i5
%mul9 = fmul double %add, %i1
%add11 = fadd double %mul9, %i2
%mul13 = fmul double %add7, %i4
%add15 = fadd double %mul13, %i5
%mul16 = fmul double %add11, %add15
ret double %mul16
; CHECK: @test1
; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
; CHECK: %mul = fmul <2 x double> %i0, %i1
; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8
; CHECK: %add = fadd <2 x double> %mul, %i2
; CHECK: %mul9 = fmul <2 x double> %add, %i1
; CHECK: %add11 = fadd <2 x double> %mul9, %i2
; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
; CHECK: ret double %mul16
}

View File

@ -0,0 +1,93 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
; The second check covers the use of alias analysis (with loop unrolling).
define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
entry:
br label %for.body
; CHECK: @test1
; CHECK-UNRL: @test1
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
%0 = load double* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
%1 = load double* %arrayidx2, align 8
%mul = fmul double %0, %0
%mul3 = fmul double %0, %1
%add = fadd double %mul, %mul3
%add4 = fadd double %1, %1
%add5 = fadd double %add4, %0
%mul6 = fmul double %0, %add5
%add7 = fadd double %add, %mul6
%mul8 = fmul double %1, %1
%add9 = fadd double %0, %0
%add10 = fadd double %add9, %0
%mul11 = fmul double %mul8, %add10
%add12 = fadd double %add7, %mul11
%arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
store double %add12, double* %arrayidx14, align 8
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 10
br i1 %exitcond, label %for.end, label %for.body
; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
; CHECK: %0 = load double* %arrayidx, align 8
; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
; CHECK: %1 = load double* %arrayidx2, align 8
; CHECK: %mul = fmul double %0, %0
; CHECK: %mul3 = fmul double %0, %1
; CHECK: %add = fadd double %mul, %mul3
; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
; CHECK: %mul8 = fmul double %1, %1
; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
; CHECK: %add7 = fadd double %add, %mul6.v.r1
; CHECK: %add12 = fadd double %add7, %mul6.v.r2
; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
; CHECK: store double %add12, double* %arrayidx14, align 8
; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
; CHECK: br i1 %exitcond, label %for.end, label %for.body
; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}

View File

@ -0,0 +1,17 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
define double @test1(double %A1, double %A2, double %B1, double %B2) {
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
%R = fmul double %Y1, %Y2
ret double %R
; CHECK-RD3: @test1
; CHECK-RD2: @test1
; CHECK-RD3-NOT: <2 x double>
; CHECK-RD2: <2 x double>
}

View File

@ -0,0 +1,46 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
define double @test1(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test1
; CHECK-SL4: @test1
; CHECK-SL4-NOT: <2 x double>
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y1, %B1
; Here we have a dependency chain: the short search limit will not
; see past this chain and so will not see the second part of the
; pair to vectorize.
%mul41 = fmul double %Z1, %Y2
%sub48 = fsub double %Z1, %mul41
%mul62 = fmul double %Z1, %sub48
%sub69 = fsub double %Z1, %mul62
%mul83 = fmul double %Z1, %sub69
%sub90 = fsub double %Z1, %mul83
%mul104 = fmul double %Z1, %sub90
%sub111 = fsub double %Z1, %mul104
%mul125 = fmul double %Z1, %sub111
%sub132 = fsub double %Z1, %mul125
%mul146 = fmul double %Z1, %sub132
%sub153 = fsub double %Z1, %mul146
; end of chain.
%Z2 = fadd double %Y2, %B2
; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
%R1 = fdiv double %Z1, %Z2
%R = fmul double %R1, %sub153
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
}

View File

@ -0,0 +1,59 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
declare double @llvm.fma.f64(double, double, double)
declare double @llvm.cos.f64(double)
; Basic depth-3 chain with fma
define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
%Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
%Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
%Z1 = fadd double %Y1, %B1
%Z2 = fadd double %Y2, %B2
%R = fmul double %Z1, %Z2
ret double %R
; CHECK: @test1
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
; CHECK: ret double %R
}
; Basic depth-3 chain with cos
define double @test2(double %A1, double %A2, double %B1, double %B2) {
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
%Y1 = call double @llvm.cos.f64(double %X1)
%Y2 = call double @llvm.cos.f64(double %X2)
%Z1 = fadd double %Y1, %B1
%Z2 = fadd double %Y2, %B2
%R = fmul double %Z1, %Z2
ret double %R
; CHECK: @test2
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
; CHECK: ret double %R
}
; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly

View File

@ -0,0 +1,110 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
; Simple 3-pair chain with loads and stores
define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
entry:
%i0 = load double* %a, align 8
%i1 = load double* %b, align 8
%mul = fmul double %i0, %i1
%arrayidx3 = getelementptr inbounds double* %a, i64 1
%i3 = load double* %arrayidx3, align 8
%arrayidx4 = getelementptr inbounds double* %b, i64 1
%i4 = load double* %arrayidx4, align 8
%mul5 = fmul double %i3, %i4
store double %mul, double* %c, align 8
%arrayidx5 = getelementptr inbounds double* %c, i64 1
store double %mul5, double* %arrayidx5, align 8
ret void
; CHECK: @test1
; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
; CHECK: %mul = fmul <2 x double> %i0, %i1
; CHECK: %0 = bitcast double* %c to <2 x double>*
; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
; CHECK: ret void
; CHECK-AO: @test1
; CHECK-AO-NOT: <2 x double>
}
; Simple chain with extending loads and stores
define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly {
entry:
%i0f = load float* %a, align 4
%i0 = fpext float %i0f to double
%i1f = load float* %b, align 4
%i1 = fpext float %i1f to double
%mul = fmul double %i0, %i1
%arrayidx3 = getelementptr inbounds float* %a, i64 1
%i3f = load float* %arrayidx3, align 4
%i3 = fpext float %i3f to double
%arrayidx4 = getelementptr inbounds float* %b, i64 1
%i4f = load float* %arrayidx4, align 4
%i4 = fpext float %i4f to double
%mul5 = fmul double %i3, %i4
store double %mul, double* %c, align 8
%arrayidx5 = getelementptr inbounds double* %c, i64 1
store double %mul5, double* %arrayidx5, align 8
ret void
; CHECK: @test2
; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4
; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4
; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
; CHECK: %mul = fmul <2 x double> %i0, %i1
; CHECK: %0 = bitcast double* %c to <2 x double>*
; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
; CHECK: ret void
; CHECK-AO: @test2
; CHECK-AO-NOT: <2 x double>
}
; Simple chain with loads and truncating stores
define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly {
entry:
%i0 = load double* %a, align 8
%i1 = load double* %b, align 8
%mul = fmul double %i0, %i1
%mulf = fptrunc double %mul to float
%arrayidx3 = getelementptr inbounds double* %a, i64 1
%i3 = load double* %arrayidx3, align 8
%arrayidx4 = getelementptr inbounds double* %b, i64 1
%i4 = load double* %arrayidx4, align 8
%mul5 = fmul double %i3, %i4
%mul5f = fptrunc double %mul5 to float
store float %mulf, float* %c, align 8
%arrayidx5 = getelementptr inbounds float* %c, i64 1
store float %mul5f, float* %arrayidx5, align 4
ret void
; CHECK: @test3
; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
; CHECK: %mul = fmul <2 x double> %i0, %i1
; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
; CHECK: %0 = bitcast float* %c to <2 x float>*
; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
; CHECK: ret void
; CHECK-AO: @test3
; CHECK-AO: %i0 = load double* %a, align 8
; CHECK-AO: %i1 = load double* %b, align 8
; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
; CHECK-AO: %i3 = load double* %arrayidx3, align 8
; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
; CHECK-AO: %i4 = load double* %arrayidx4, align 8
; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
; CHECK-AO: ret void
}

View File

@ -0,0 +1,152 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; Basic depth-3 chain
define double @test1(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test1
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y1, %B1
%Z2 = fadd double %Y2, %B2
; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
%R = fmul double %Z1, %Z2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
}
; Basic depth-3 chain (last pair permuted)
define double @test2(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test2
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y2, %B1
%Z2 = fadd double %Y1, %B2
; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
%R = fmul double %Z1, %Z2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
}
; Basic depth-3 chain (last pair first splat)
define double @test3(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test3
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y2, %B1
%Z2 = fadd double %Y2, %B2
; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
%R = fmul double %Z1, %Z2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
}
; Basic depth-3 chain (last pair second splat)
define double @test4(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test4
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y1, %B1
%Z2 = fadd double %Y1, %B2
; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer
; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
%R = fmul double %Z1, %Z2
; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
; CHECK: ret double %R
}
; Basic depth-3 chain
define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
; CHECK: @test5
; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%X1 = fsub <2 x float> %A1, %B1
%X2 = fsub <2 x float> %A2, %B2
; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1
%Y1 = fmul <2 x float> %X1, %A1
%Y2 = fmul <2 x float> %X2, %A2
; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0
%Z1 = fadd <2 x float> %Y1, %B1
%Z2 = fadd <2 x float> %Y2, %B2
; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1
%R = fmul <2 x float> %Z1, %Z2
; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2
ret <2 x float> %R
; CHECK: ret <2 x float> %R
}
; Basic chain with shuffles
define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
; CHECK: @test6
; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%X1 = sub <8 x i8> %A1, %B1
%X2 = sub <8 x i8> %A2, %B2
; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1
%Y1 = mul <8 x i8> %X1, %A1
%Y2 = mul <8 x i8> %X2, %A2
; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0
%Z1 = add <8 x i8> %Y1, %B1
%Z2 = add <8 x i8> %Y2, %B2
; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
%Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
%Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
%R = mul <8 x i8> %Q1, %Q2
; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
ret <8 x i8> %R
; CHECK: ret <8 x i8> %R
}

View File

@ -1,5 +1,5 @@
set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo
linker bitreader bitwriter)
linker bitreader bitwriter vectorize)
add_llvm_tool(bugpoint
BugDriver.cpp

View File

@ -10,6 +10,6 @@
LEVEL := ../..
TOOLNAME := bugpoint
LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \
bitwriter
bitwriter vectorize
include $(LEVEL)/Makefile.common

View File

@ -1,4 +1,4 @@
set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter)
set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize)
add_llvm_tool(llvm-ld
Optimize.cpp

View File

@ -9,6 +9,6 @@
LEVEL := ../..
TOOLNAME := llvm-ld
LINK_COMPONENTS := ipo scalaropts linker archive bitwriter
LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize
include $(LEVEL)/Makefile.common

View File

@ -1,6 +1,6 @@
set(LLVM_LINK_COMPONENTS
${LLVM_TARGETS_TO_BUILD}
ipo scalaropts linker bitreader bitwriter mcdisassembler)
ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize)
add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )

View File

@ -10,7 +10,7 @@
LEVEL := ../..
LIBRARYNAME := LTO
LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \
mcdisassembler
mcdisassembler vectorize
LINK_LIBS_IN_SHARED := 1
SHARED_LIBRARY := 1

View File

@ -1,4 +1,4 @@
set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo)
set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)
add_llvm_tool(opt
AnalysisWrappers.cpp

View File

@ -9,6 +9,6 @@
LEVEL := ../..
TOOLNAME := opt
LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo
LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize
include $(LEVEL)/Makefile.common

View File

@ -480,6 +480,7 @@ int main(int argc, char **argv) {
PassRegistry &Registry = *PassRegistry::getPassRegistry();
initializeCore(Registry);
initializeScalarOpts(Registry);
initializeVectorization(Registry);
initializeIPO(Registry);
initializeAnalysis(Registry);
initializeIPA(Registry);