From b4bd0a404fe26071dab0854dfd9767974909c7c4 Mon Sep 17 00:00:00 2001
From: Sebastian Pop
Date: Fri, 9 Mar 2018 14:29:21 +0000
Subject: [PATCH] [x86][aarch64] ask the backend whether it has a vector blend instruction

The code to match and produce more x86 vector blends was enabled for all
architectures even though the transform may pessimize the code for other
architectures that do not provide a vector blend instruction.

Added an aarch64 testcase to check that a VZIP instruction is generated
instead of byte movs.

Differential Revision: https://reviews.llvm.org/D44118

llvm-svn: 327132
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  3 ++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 50 ++++++++++---------
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 llvm/test/CodeGen/AArch64/aarch64-vuzp.ll     | 10 ++++
 .../test/CodeGen/AArch64/arm64-collect-loh.ll |  4 +-
 5 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7e03f42445b8..29b0384be8c0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2117,6 +2117,9 @@ public:
     return false;
   }
 
+  /// Return true if the target has a vector blend instruction.
+  virtual bool hasVectorBlend() const { return false; }
+
   /// \brief Get the maximum supported factor for interleaved memory accesses.
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9d0ea3857928..aac79075e6e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1557,33 +1557,35 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
   if (N1.isUndef())
     commuteShuffle(N1, N2, MaskVec);
 
-  // If shuffling a splat, try to blend the splat instead. We do this here so
-  // that even when this arises during lowering we don't have to re-handle it.
-  auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
-    BitVector UndefElements;
-    SDValue Splat = BV->getSplatValue(&UndefElements);
-    if (!Splat)
-      return;
+  if (TLI->hasVectorBlend()) {
+    // If shuffling a splat, try to blend the splat instead. We do this here so
+    // that even when this arises during lowering we don't have to re-handle it.
+    auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+      BitVector UndefElements;
+      SDValue Splat = BV->getSplatValue(&UndefElements);
+      if (!Splat)
+        return;
 
-    for (int i = 0; i < NElts; ++i) {
-      if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
-        continue;
+      for (int i = 0; i < NElts; ++i) {
+        if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+          continue;
 
-      // If this input comes from undef, mark it as such.
-      if (UndefElements[MaskVec[i] - Offset]) {
-        MaskVec[i] = -1;
-        continue;
+        // If this input comes from undef, mark it as such.
+        if (UndefElements[MaskVec[i] - Offset]) {
+          MaskVec[i] = -1;
+          continue;
+        }
+
+        // If we can blend a non-undef lane, use that instead.
+        if (!UndefElements[i])
+          MaskVec[i] = i + Offset;
       }
-
-      // If we can blend a non-undef lane, use that instead.
-      if (!UndefElements[i])
-        MaskVec[i] = i + Offset;
-    }
-  };
-  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
-    BlendSplat(N1BV, 0);
-  if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
-    BlendSplat(N2BV, NElts);
+    };
+    if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+      BlendSplat(N1BV, 0);
+    if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+      BlendSplat(N2BV, NElts);
+  }
 
   // Canonicalize all index into lhs, -> shuffle lhs, undef
   // Canonicalize all index into rhs, -> shuffle rhs, undef
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index df82c5a5b82b..6181dbd2a6ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1098,6 +1098,8 @@ namespace llvm {
 
     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
 
+    bool hasVectorBlend() const override { return true; }
+
     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
     /// \brief Lower interleaved load(s) into target specific
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index cc91873a75bb..a7b20f25557c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -58,3 +58,13 @@ entry:
   store <4 x i32> %y, <4 x i32>* %z, align 4
   ret void
 }
+
+; Check that this pattern is recognized as a VZIP and
+; that the vector blend transform does not scramble the pattern.
+; CHECK-LABEL: vzipNoBlend:
+; CHECK: zip1
+define <8 x i8> @vzipNoBlend(<8 x i8>* %A, <8 x i16>* %B) nounwind {
+  %t = load <8 x i8>, <8 x i8>* %A
+  %vzip = shufflevector <8 x i8> %t, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %vzip
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index c7ba989d933e..dc34ee243b2a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -638,13 +638,13 @@ define void @setL(<1 x i8> %t) {
 ; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]:
 ; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF]
 ; The tuple comes from the next instruction.
-; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]]
+; CHECK: ext.16b v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, #1
 ; CHECK: ret
 ; CHECK: .loh AdrpLdr [[LOH_LABEL0]], [[LOH_LABEL1]]
 define void @uninterestingSub(i8* nocapture %row) #0 {
   %tmp = bitcast i8* %row to <16 x i8>*
   %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16
-  %vext43 = shufflevector <16 x i8> , <16 x i8> %tmp1, <16 x i32>
+  %vext43 = shufflevector <16 x i8> , <16 x i8> %tmp1, <16 x i32>
   %add.i.414 = add <16 x i8> zeroinitializer, %vext43
   store <16 x i8> %add.i.414, <16 x i8>* %tmp, align 16
   %add.ptr51 = getelementptr inbounds i8, i8* %row, i64 16
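Note (not part of the patch): the new TargetLowering::hasVectorBlend() hook defaults
to false, so the splat-blend rewrite in SelectionDAG::getVectorShuffle now runs only
for targets that opt in, as X86ISelLowering.h does above. A minimal sketch of how
another backend would opt in is shown below; "Foo" is a hypothetical target used
purely for illustration, not real LLVM code.

  // FooISelLowering.h -- hypothetical target, illustration only.
  #include "llvm/CodeGen/TargetLowering.h"

  namespace llvm {
  class FooTargetLowering : public TargetLowering {
  public:
    explicit FooTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

    // Returning true keeps the splat-blend canonicalization enabled in
    // SelectionDAG::getVectorShuffle for this target.
    bool hasVectorBlend() const override { return true; }
  };
  } // namespace llvm

Targets without a native vector blend, such as AArch64 here, simply keep the default,
which is what lets the vzipNoBlend test above retain its zip1 pattern.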