[VectorCombine] position pass after SLP in the optimization pipeline rather than before

There are 2 known problem patterns shown in the test diffs here:
vector horizontal ops (an x86 specialization) and vector reductions.

SLP has greater ability to match and fold those than vector-combine,
so let SLP have first chance at that.

This is a quick fix while we continue to improve vector-combine and
possibly canonicalize to reduction intrinsics.

In the longer term, we should improve matching of these patterns,
because if the input IR already contained the "bad" forms shown here,
we would fail to optimize them.

I'm not sure what is happening with alias analysis on the addsub test.
The old pass manager's pipeline now shows an extra "Function Alias
Analysis Results" line after the relocated passes, and we see an
improvement that comes from SLP vectorizing a store. I don't know
what's missing with the new pass manager to make that happen.
Strangely, I can't reproduce the behavior if I compile from C++ with
clang and invoke the new PM with "-fexperimental-new-pass-manager".

Differential Revision: https://reviews.llvm.org/D80236
This commit is contained in:
Sanjay Patel 2020-05-22 12:13:18 -04:00
parent 22ed724975
commit 6438ea45e0
14 changed files with 79 additions and 84 deletions

View File

@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
OptimizePM.addPass(LoopVectorizePass(
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
// Enhance/cleanup vector code.
OptimizePM.addPass(VectorCombinePass());
OptimizePM.addPass(EarlyCSEPass());
// Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration.
OptimizePM.addPass(LoopLoadEliminationPass());
@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
if (PTO.SLPVectorization)
OptimizePM.addPass(SLPVectorizerPass());
// Enhance/cleanup vector code.
OptimizePM.addPass(VectorCombinePass());
OptimizePM.addPass(EarlyCSEPass());
OptimizePM.addPass(InstCombinePass());
// Unroll small loops to hide loop backedge latency and saturate any parallel

View File

@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopDistributePass());
MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
MPM.add(createVectorCombinePass());
MPM.add(createEarlyCSEPass());
// Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration.
@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager(
}
}
// Enhance/cleanup vector code.
MPM.add(createVectorCombinePass());
MPM.add(createEarlyCSEPass());
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createInstructionCombiningPass());

View File

@ -230,8 +230,6 @@
; GCN-O1-NEXT: Optimization Remark Emitter
; GCN-O1-NEXT: Inject TLI Mappings
; GCN-O1-NEXT: Loop Vectorization
; GCN-O1-NEXT: Optimize scalar/vector ops
; GCN-O1-NEXT: Early CSE
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Function Alias Analysis Results
@ -247,6 +245,8 @@
; GCN-O1-NEXT: Combine redundant instructions
; GCN-O1-NEXT: Simplify the CFG
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Optimize scalar/vector ops
; GCN-O1-NEXT: Early CSE
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Natural Loop Information
@ -571,8 +571,6 @@
; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Inject TLI Mappings
; GCN-O2-NEXT: Loop Vectorization
; GCN-O2-NEXT: Optimize scalar/vector ops
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Canonicalize natural loops
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Function Alias Analysis Results
@ -598,6 +596,9 @@
; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Inject TLI Mappings
; GCN-O2-NEXT: SLP Vectorizer
; GCN-O2-NEXT: Optimize scalar/vector ops
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Combine redundant instructions
; GCN-O2-NEXT: Canonicalize natural loops
@ -924,8 +925,6 @@
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Inject TLI Mappings
; GCN-O3-NEXT: Loop Vectorization
; GCN-O3-NEXT: Optimize scalar/vector ops
; GCN-O3-NEXT: Early CSE
; GCN-O3-NEXT: Canonicalize natural loops
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Function Alias Analysis Results
@ -951,6 +950,9 @@
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Inject TLI Mappings
; GCN-O3-NEXT: SLP Vectorizer
; GCN-O3-NEXT: Optimize scalar/vector ops
; GCN-O3-NEXT: Early CSE
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Combine redundant instructions
; GCN-O3-NEXT: Canonicalize natural loops

View File

@ -253,8 +253,6 @@
; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass
@ -262,6 +260,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -223,8 +223,6 @@
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
@ -232,6 +230,8 @@
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -191,8 +191,6 @@
; CHECK-O-NEXT: Running pass: LoopDistributePass
; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass
@ -200,6 +198,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -202,8 +202,6 @@
; CHECK-O-NEXT: Running pass: LoopDistributePass
; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass
@ -211,6 +209,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -227,8 +227,6 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
@ -254,6 +252,9 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops

View File

@ -232,8 +232,6 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
@ -259,6 +257,9 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops

View File

@ -213,8 +213,6 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
@ -240,6 +238,9 @@
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops

View File

@ -12,15 +12,15 @@
; OLDPM_O1-LABEL: Pass Arguments:
; OLDPM_O1: Loop Vectorization
; OLDPM_O1: Optimize scalar/vector ops
; OLDPM_O1-NOT: SLP Vectorizer
; OLDPM_O1: Optimize scalar/vector ops
; Everything runs at -O2.
; OLDPM_O2-LABEL: Pass Arguments:
; OLDPM_O2: Loop Vectorization
; OLDPM_O2: Optimize scalar/vector ops
; OLDPM_O2: SLP Vectorizer
; OLDPM_O2: Optimize scalar/vector ops
; The loop vectorizer still runs at both -O1/-O2 even with the
; debug flag, but it only works on loops explicitly annotated
@ -28,24 +28,24 @@
; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments:
; OLDPM_O1_FORCE_OFF: Loop Vectorization
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments:
; OLDPM_O2_FORCE_OFF: Loop Vectorization
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O2_FORCE_OFF: SLP Vectorizer
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
; There should be no difference with the new pass manager.
; This is tested more thoroughly in other test files.
; NEWPM_O1-LABEL: Running pass: LoopVectorizePass
; NEWPM_O1: Running pass: VectorCombinePass
; NEWPM_O1-NOT: Running pass: SLPVectorizerPass
; NEWPM_O1: Running pass: VectorCombinePass
; NEWPM_O2-LABEL: Running pass: LoopVectorizePass
; NEWPM_O2: Running pass: VectorCombinePass
; NEWPM_O2: Running pass: SLPVectorizerPass
; NEWPM_O2: Running pass: VectorCombinePass
define void @f() {
ret void

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S | FileCheck %s
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
; RUN: opt < %s -O3 -S | FileCheck %s --check-prefixes=CHECK,OLDPM
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,NEWPM
target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float>
}
define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
; CHECK-LABEL: @add_aggregate_store(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4
; CHECK-NEXT: ret void
; OLDPM-LABEL: @add_aggregate_store(
; OLDPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; OLDPM-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; OLDPM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
; OLDPM-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
; OLDPM-NEXT: ret void
;
; NEWPM-LABEL: @add_aggregate_store(
; NEWPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; NEWPM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; NEWPM-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
; NEWPM-NEXT: store float [[TMP2]], float* [[R0]], align 4
; NEWPM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; NEWPM-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
; NEWPM-NEXT: store float [[TMP3]], float* [[R1]], align 4
; NEWPM-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; NEWPM-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; NEWPM-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
; NEWPM-NEXT: store float [[TMP5]], float* [[R2]], align 4
; NEWPM-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
; NEWPM-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
; NEWPM-NEXT: store float [[TMP6]], float* [[R3]], align 4
; NEWPM-NEXT: ret void
;
%a00 = extractelement <2 x float> %a0, i32 0
%b00 = extractelement <2 x float> %b0, i32 0

View File

@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: @hadd_reverse_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
; CHECK-NEXT: [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]]
; CHECK-NEXT: [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]]
; CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT: ret <4 x float> [[VECINIT14]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[TMP3]]
;
%shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: @reverse_hadd_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float> [[TMP11]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: ret <4 x float> [[TMP4]]
;
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1

View File

@ -5,19 +5,15 @@
target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; FIXME: This should only need 2 'or' instructions.
define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
; CHECK-NEXT: ret i32 [[TMP7]]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret i32 [[TMP1]]
;
%z = and <4 x i32> %x, %y
%z0 = extractelement <4 x i32> %z, i32 0