From 49bdbce8e1f2cdba74ba5d340451d1ec367113c8 Mon Sep 17 00:00:00 2001 From: James Molloy Date: Thu, 6 Sep 2012 09:55:02 +0000 Subject: [PATCH] Improve codegen for BUILD_VECTORs on ARM. If we have a BUILD_VECTOR that is mostly a constant splat, it is often better to splat that constant then insertelement the non-constant lanes instead of insertelementing every lane from an undef base. llvm-svn: 163304 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 66 +++++++++++++++++++++---- llvm/test/CodeGen/ARM/vdup.ll | 34 +++++++++++++ 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index c17e9ae381e9..62c758931e1e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4161,10 +4161,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } // Scan through the operands to see if only one value is used. + // + // As an optimisation, even if more than one value is used it may be more + // profitable to splat with one value then change some lanes. + // + // Heuristically we decide to do this if the vector has a "dominant" value, + // defined as splatted to more than half of the lanes. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; + bool hasDominantValue = false; bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap<SDValue, int> ValueCounts; SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -4175,13 +4186,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) isConstant = false; - if (!Value.getNode()) + ValueCounts.insert(std::make_pair(V, 0)); + int &Count = ValueCounts[V]; + + // Is this value dominant? 
(takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; Value = V; - else if (V != Value) - usesOnlyOneValue = false; + } } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; - if (!Value.getNode()) + if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); if (isOnlyLowElement) @@ -4191,9 +4210,34 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Use VDUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. - if (usesOnlyOneValue && EltSize <= 32) { - if (!isConstant) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (hasDominantValue && EltSize <= 32) { + if (!isConstant) { + SDValue N; + + // If we are VDUPing a value that comes directly from a vector, that will + // cause an unnecessary move to and from a GPR, where instead we could + // just use VDUPLANE. + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + Value->getOperand(0), Value->getOperand(1)); + else + N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. 
+ for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector<SDValue, 3> Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, MVT::i32)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3); + } + } + return N; + } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) @@ -4205,9 +4249,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); - if (Val.getNode()) - return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + if (usesOnlyOneValue) { + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (isConstant && Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } } // If all elements are constants and the case above didn't get hit, fall back diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll index 05332e4d8c5b..a8c224b43856 100644 --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -261,3 +261,37 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind { store <8 x i8> %2, <8 x i8>* %ptr, align 8 ret void } + +define <4 x i32> @tdupi(i32 %x, i32 %y) { +;CHECK: tdupi +;CHECK: vdup.32 + %1 = insertelement <4 x i32> undef, i32 %x, i32 0 + %2 = insertelement <4 x i32> %1, i32 %x, i32 1 + %3 = insertelement <4 x i32> %2, i32 %x, i32 2 + %4 = insertelement <4 x i32> %3, i32 %y, i32 3 + ret <4 x i32> %4 +} + +define <4 x float> @tdupf(float %x, float %y) { +;CHECK: tdupf +;CHECK: vdup.32 + %1 = insertelement <4 x float> undef, float %x, i32 0 + %2 = insertelement <4 x float> %1, float %x, i32 1 + %3 = insertelement <4 x float> %2, float %x, i32 2 + %4 = insertelement <4 x float> %3, float %y, i32 3 + ret <4 x float> %4 +} + +; This test checks that when splatting an element from a vector into another, +; the value isn't moved out 
to GPRs first. +define <4 x i32> @tduplane(<4 x i32> %invec) { +;CHECK: tduplane +;CHECK-NOT: vmov {{.*}}, d16[1] +;CHECK: vdup.32 {{.*}}, d16[1] + %in = extractelement <4 x i32> %invec, i32 1 + %1 = insertelement <4 x i32> undef, i32 %in, i32 0 + %2 = insertelement <4 x i32> %1, i32 %in, i32 1 + %3 = insertelement <4 x i32> %2, i32 %in, i32 2 + %4 = insertelement <4 x i32> %3, i32 255, i32 3 + ret <4 x i32> %4 +}