From eeeb18cd075ddf7a44c8571f9e17e4b1fcbc8aa4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 1 Nov 2019 11:28:32 -0700 Subject: [PATCH] [X86] Change the behavior of canWidenShuffleElements used by lowerV2X128Shuffle to match the behavior in lowerVectorShuffle with regard to zeroable elements. Previously we marked zeroable elements in a way that prevented the widening check from recognizing that it could widen. Now we only mark them zeroable if V2 is an all-zeros vector. This matches what we do for widening elements in lowerVectorShuffle. Fixes PR43866. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++++++++++------------ llvm/test/CodeGen/X86/pr43866.ll | 37 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr43866.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c7a45f65e989..2862b7aa3b53 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5324,15 +5324,18 @@ static bool canWidenShuffleElements(ArrayRef Mask, static bool canWidenShuffleElements(ArrayRef Mask, const APInt &Zeroable, + bool V2IsZero, SmallVectorImpl &WidenedMask) { - SmallVector TargetMask(Mask.begin(), Mask.end()); - for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - if (TargetMask[i] == SM_SentinelUndef) - continue; - if (Zeroable[i]) - TargetMask[i] = SM_SentinelZero; + // Create an alternative mask with info about zeroable elements. + // Here we do not set undef elements as zeroable. 
+ SmallVector ZeroableMask(Mask.begin(), Mask.end()); + if (V2IsZero) { + assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); + for (int i = 0, Size = Mask.size(); i != Size; ++i) + if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + ZeroableMask[i] = SM_SentinelZero; } - return canWidenShuffleElements(TargetMask, WidenedMask); + return canWidenShuffleElements(ZeroableMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef Mask) { @@ -14817,8 +14820,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); + bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector WidenedMask; - if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask)) + if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; @@ -17095,23 +17100,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); - // Create an alternative mask with info about zeroable elements. - // Here we do not set undef elements as zeroable. - SmallVector ZeroableMask(OrigMask.begin(), OrigMask.end()); - if (V2IsZero) { - assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); - for (int i = 0; i != NumElements; ++i) - if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) - ZeroableMask[i] = SM_SentinelZero; - } - // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. 
SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && - canWidenShuffleElements(ZeroableMask, WidenedMask)) { + canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this diff --git a/llvm/test/CodeGen/X86/pr43866.ll b/llvm/test/CodeGen/X86/pr43866.ll new file mode 100644 index 000000000000..a430975c47d4 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr43866.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s + +@v2_0 = global <2 x i32> zeroinitializer, align 8 + +define void @test() { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-32, %rsp +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4] +; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %v8_0 = alloca <8 x i32>, align 32 + %v8_0.0.v8_0.0..sroa_cast = bitcast <8 x i32>* %v8_0 to i8* + %0 = load <2 x i32>, <2 x i32>* @v2_0, align 8 + %shuffle = shufflevector <2 x i32> %0, <2 x i32> , <8 x i32> + store volatile <8 x i32> %shuffle, <8 
x i32>* %v8_0, align 32 + ret void +}