diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index da3ac06ab464..554944404708 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4146,8 +4146,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (AllowReorder && R.shouldReorder()) { // Conceptually, there is nothing actually preventing us from trying to // reorder a larger list. In fact, we do exactly this when vectorizing - // reductions. However, at this point, we only expect to get here from - // tryToVectorizePair(). + // reductions. However, at this point, we only expect to get here when + // there are exactly two operations. assert(Ops.size() == 2); assert(BuildVectorSlice.empty()); Value *ReorderedOps[] = {Ops[1], Ops[0]}; @@ -4904,7 +4904,13 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { + // The order in which the phi nodes appear in the program does not matter. + // So allow tryToVectorizeList to reorder them if it is beneficial. This + // is done when there are exactly two elements since tryToVectorizeList + // asserts that there are only two values when AllowReorder is true. + bool AllowReorder = NumElts == 2; + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + None, AllowReorder)) { // Success start over because instructions might have been changed. 
HaveVectorizedPhiNodes = true; Changed = true; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll new file mode 100644 index 000000000000..f7f58d7350b3 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=corei7-avx | FileCheck %s + +%struct.complex = type { float, float } + +; CHECK-LABEL: void @foo +define void @foo (%struct.complex* %A, %struct.complex* %B, %struct.complex* %Result) { + +entry: + %0 = add i64 256, 0 + br label %loop + +; CHECK-LABEL: loop +; CHECK: [[REG0:%[0-9]+]] = phi <2 x float> {{.*}}[ [[REG1:%[0-9]+]], %loop ] +; CHECK: [[REG2:%[0-9]+]] = load <2 x float>, <2 x float>* +; CHECK: [[REG3:%[0-9]+]] = fmul <2 x float> [[REG2]] +; CHECK: [[REG4:%[0-9]+]] = fmul <2 x float> +; CHECK: fsub <2 x float> [[REG3]], [[REG4]] +; CHECK: fadd <2 x float> [[REG3]], [[REG4]] +; CHECK: shufflevector <2 x float> +; CHECK: [[REG1]] = fadd <2 x float>{{.*}}[[REG0]] +loop: + + %1 = phi i64 [ 0, %entry ], [ %20, %loop ] + %2 = phi float [ 0.000000e+00, %entry ], [ %19, %loop ] + %3 = phi float [ 0.000000e+00, %entry ], [ %18, %loop ] + %4 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 0 + %5 = load float, float* %4, align 4 + %6 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 1 + %7 = load float, float* %6, align 4 + %8 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 0 + %9 = load float, float* %8, align 4 + %10 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 1 + %11 = load float, float* %10, align 4 + %12 = fmul float %5, %9 + %13 = fmul float %7, %11 + %14 = fsub float %12, %13 + %15 = fmul float %7, %9 + %16 = fmul float %5, %11 + %17 = fadd float %15, %16 + %18 = fadd float %3, %14 + %19 = fadd float %2, %17 + %20 = add nuw nsw i64 %1, 1 + %21 = icmp eq 
i64 %20, %0 + br i1 %21, label %exit, label %loop + +exit: + %22 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result, i32 0, i32 0 + store float %18, float* %22, align 4 + %23 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result, i32 0, i32 1 + store float %19, float* %23, align 4 + + ret void + +}