From 36750ba5bd0e9e72120dbfaab4166baafd89e98a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 4 Aug 2020 12:35:46 +0100
Subject: [PATCH] [X86][AVX] isHorizontalBinOp - relax lane-crossing limits for AVX1-only targets.

Permit lane-crossing post shuffles on AVX1 targets as long as every element comes from the same source lane, which for v8f32/v4f64 cases can be efficiently lowered with the LowerShuffleAsLanePermuteAnd* style methods.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 36 +++++++++++++++++++++----
 llvm/test/CodeGen/X86/haddsub-4.ll      |  6 ++---
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 79047c90ff99..1f59cd820ad7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10661,6 +10661,35 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
 }
 
+/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
+/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
+/// better support 'repeated mask + lane permute' style shuffles.
+static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
+                                   unsigned ScalarSizeInBits,
+                                   ArrayRef<int> Mask) {
+  assert(LaneSizeInBits && ScalarSizeInBits &&
+         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+         "Illegal shuffle lane size");
+  int NumElts = Mask.size();
+  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
+  int NumLanes = NumElts / NumEltsPerLane;
+  if (NumLanes > 1) { // With at most one lane there is nothing to mix.
+    for (int i = 0; i != NumLanes; ++i) {
+      int SrcLane = -1; // No source lane seen yet for destination lane i.
+      for (int j = 0; j != NumEltsPerLane; ++j) {
+        int M = Mask[(i * NumEltsPerLane) + j];
+        if (M < 0) // Negative entries are skipped.
+          continue;
+        int Lane = (M % NumElts) / NumEltsPerLane;
+        if (SrcLane >= 0 && SrcLane != Lane)
+          return true; // Destination lane i reads from two source lanes.
+        SrcLane = Lane;
+      }
+    }
+  }
+  return false;
+}
+
 /// Test whether a shuffle mask is equivalent within each sub-lane.
 ///
 /// This checks a shuffle mask to see if it is performing the same
@@ -44598,12 +44627,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   if (IsIdentityPostShuffle)
     PostShuffleMask.clear();
 
-  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split), unless
-  // the shuffle can widen to shuffle entire lanes, which should still be quick.
+  // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
-      isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
-                                PostShuffleMask) &&
-      !canScaleShuffleElements(PostShuffleMask, 2))
+      isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
     return false;
 
   // Assume a SingleSource HOP if we only shuffle one input and don't need to
diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll
index 720b63431a24..baa03d259188 100644
--- a/llvm/test/CodeGen/X86/haddsub-4.ll
+++ b/llvm/test/CodeGen/X86/haddsub-4.ll
@@ -64,11 +64,9 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
 ;
 ; AVX1-LABEL: hadd_reverse_v8f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm1[3,1],ymm0[7,5],ymm1[7,5]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
+; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: hadd_reverse_v8f32: