From cadcfed7aac7be93ec0fded17ac81d3517c7ddf3 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella
Date: Wed, 27 Jun 2018 13:58:46 +0000
Subject: [PATCH] [AArch64] Add custom lowering for v4i8 trunc store

This patch adds a custom trunc store lowering for v4i8 vector types.

Since there is no v.4b register, v4i8 is promoted to v4i16 (v.4h) and
the default action for v4i8 is to extract each element and issue four
byte stores.

A better strategy is to extend the promoted v4i16 to v8i16 (with undef
elements) and extract and store the word lane which represents the
v4i8 subvector.

The construction:

define void @foo(<4 x i16> %x, i8* nocapture %p) {
  %0 = trunc <4 x i16> %x to <4 x i8>
  %1 = bitcast i8* %p to <4 x i8>*
  store <4 x i8> %0, <4 x i8>* %1, align 4, !tbaa !2
  ret void
}

can then be optimized from:

umov    w8, v0.h[3]
umov    w9, v0.h[2]
umov    w10, v0.h[1]
umov    w11, v0.h[0]
strb    w8, [x0, #3]
strb    w9, [x0, #2]
strb    w10, [x0, #1]
strb    w11, [x0]
ret

to:

xtn     v0.8b, v0.8h
str     s0, [x0]
ret

The patch also adjusts the memory cost for autovectorization, so the C
code:

void foo (const int *src, int width, unsigned char *dst)
{
  for (int i = 0; i < width; i++)
    *dst++ = *src++;
}

can be vectorized to:

.LBB0_4:                                // %vector.body
                                        // =>This Inner Loop Header: Depth=1
        ldr     q0, [x0], #16
        subs    x12, x12, #4            // =4
        xtn     v0.4h, v0.4s
        xtn     v0.8b, v0.8h
        st1     { v0.s }[0], [x2], #4
        b.ne    .LBB0_4

instead of byte operations.

llvm-svn: 335735
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 66 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 .../AArch64/AArch64TargetTransformInfo.cpp    | 24 ++++---
 llvm/test/Analysis/CostModel/AArch64/store.ll |  2 +-
 .../AArch64/neon-truncStore-extLoad.ll        | 10 +++
 .../AArch64/interleaved-vs-scalar.ll          |  1 -
 6 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 884048041544..0c72f2ebee18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -742,6 +742,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2673,6 +2675,68 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 }
 
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
+  // extracts the word lane which represents the v4i8 subvector. It optimizes
+  // the store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar and/or default or with
+// a truncating operation. Currently we only custom lower truncating vector
+// stores from v4i16 to v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert (StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert (VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2784,6 +2848,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c8ebf4d93987..8d883c14c2c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@ private:
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 42639edb4a06..d75fef7b0171 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Ty->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+    unsigned ProfitableNumElements;
+    if (Opcode == Instruction::Store)
+      // We use a custom trunc store lowering so v.4b should be profitable.
+      ProfitableNumElements = 4;
+    else
+      // We scalarize the loads because there is not v.4b register and we
+      // have to promote the elements to v.2.
+      ProfitableNumElements = 8;
+
+    if (Ty->getVectorNumElements() < ProfitableNumElements) {
+      unsigned NumVecElts = Ty->getVectorNumElements();
+      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+      // We generate 2 instructions per vector element.
+      return NumVectorizableInstsToAmortize * NumVecElts * 2;
+    }
   }
 
   return LT.first;
diff --git a/llvm/test/Analysis/CostModel/AArch64/store.ll b/llvm/test/Analysis/CostModel/AArch64/store.ll
index 085863554f00..dfbf79b0bb40 100644
--- a/llvm/test/Analysis/CostModel/AArch64/store.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/store.ll
@@ -59,7 +59,7 @@ define void @getMemoryOpCost() {
   ; these types (they get extended to v.4h/v.2s).
   ; CHECK: cost of 16 {{.*}} store
   store <2 x i8> undef, <2 x i8> * undef
-  ; CHECK: cost of 64 {{.*}} store
+  ; CHECK: cost of 1 {{.*}} store
   store <4 x i8> undef, <4 x i8> * undef
   ; CHECK: cost of 16 {{.*}} load
   load <2 x i8> , <2 x i8> * undef
diff --git a/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
index 0d5ebb324ecb..2f2f54fd5a5d 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -20,6 +20,16 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK-NEXT: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK-NEXT: str s{{[0-9]+}}, [x{{[0-9]+}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
 ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
index 0ebb7a92edae..10405b8cd16d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -15,7 +15,6 @@ target triple = "aarch64--linux-gnu"
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
 ; CHECK: load i8
-; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {
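
For reference, the codegen change can also be checked with a standalone
reproducer built from the example in the commit message. This is a
minimal sketch and not part of the patch: the !tbaa metadata is dropped
so the module is self-contained, and the file name foo.ll is arbitrary.
With this patch applied, llc -mtriple=aarch64-linux-gnu -o - foo.ll
should emit the xtn/str sequence shown above.

; foo.ll
; Expected lowering of the trunc store with this patch:
;   xtn v0.8b, v0.8h
;   str s0, [x0]
; instead of four umov/strb pairs.
define void @foo(<4 x i16> %x, i8* nocapture %p) {
  %0 = trunc <4 x i16> %x to <4 x i8>
  %1 = bitcast i8* %p to <4 x i8>*
  store <4 x i8> %0, <4 x i8>* %1, align 4
  ret void
}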