From fef9b6eea63b095ac32526342d45d726f6704050 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 5 Jun 2018 10:52:29 +0000
Subject: [PATCH] [X86][SSE] Add target shuffle support to
 X86TargetLowering::computeKnownBitsForTargetNode

Ideally we'd use resolveTargetShuffleInputs to handle faux shuffles as well, but:
(a) that code path doesn't handle general/pre-legalized ops/types very well.
(b) I'm concerned about the compute time, as these paths recurse into
    computeKnownBits/ComputeNumSignBits calls which would need depth limiting somehow.

llvm-svn: 334007
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 51 +++++++++++++++++++++++++
 llvm/test/CodeGen/X86/pr35918.ll        |  3 +-
 llvm/test/CodeGen/X86/psubus.ll         | 22 ++++-------
 3 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 339ab0b48fdf..0d20ed93957a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28795,6 +28795,57 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.Zero.setBitsFrom(8);
     break;
   }
+
+  // Handle target shuffles.
+  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+  if (isTargetShuffle(Opc)) {
+    bool IsUnary;
+    SmallVector<int, 64> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+                             IsUnary)) {
+      unsigned NumOps = Ops.size();
+      unsigned NumElts = VT.getVectorNumElements();
+      if (Mask.size() == NumElts) {
+        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+        Known.Zero.setAllBits(); Known.One.setAllBits();
+        for (unsigned i = 0; i != NumElts; ++i) {
+          if (!DemandedElts[i])
+            continue;
+          int M = Mask[i];
+          if (M == SM_SentinelUndef) {
+            // For UNDEF elements, we don't know anything about the common state
+            // of the shuffle result.
+            Known.resetAll();
+            break;
+          } else if (M == SM_SentinelZero) {
+            Known.One.clearAllBits();
+            continue;
+          }
+          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+                 "Shuffle index out of range");
+
+          unsigned OpIdx = (unsigned)M / NumElts;
+          unsigned EltIdx = (unsigned)M % NumElts;
+          if (Ops[OpIdx].getValueType() != VT) {
+            // TODO - handle target shuffle ops with different value types.
+            Known.resetAll();
+            break;
+          }
+          DemandedOps[OpIdx].setBit(EltIdx);
+        }
+        // Known bits are the values that are shared by every demanded element.
+        for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
+          if (!DemandedOps[i])
+            continue;
+          KnownBits Known2;
+          DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
+          Known.One &= Known2.One;
+          Known.Zero &= Known2.Zero;
+        }
+      }
+    }
+  }
 }
 
 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
diff --git a/llvm/test/CodeGen/X86/pr35918.ll b/llvm/test/CodeGen/X86/pr35918.ll
index 469fe1403852..10d39ca9196b 100644
--- a/llvm/test/CodeGen/X86/pr35918.ll
+++ b/llvm/test/CodeGen/X86/pr35918.ll
@@ -59,8 +59,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X64-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-SKYLAKE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-SKYLAKE-NEXT:    vmovd %xmm0, %eax
 ; X64-SKYLAKE-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
 ; X64-SKYLAKE-NEXT:    movl %eax, (%rdi)
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 7b375b494a60..295a1482a140 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1620,15 +1620,10 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT:    packusdw %xmm3, %xmm7
 ; SSE41-NEXT:    packusdw %xmm4, %xmm7
 ; SSE41-NEXT:    psubusw %xmm7, %xmm10
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; SSE41-NEXT:    movdqa %xmm10, %xmm1
-; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[12,13],zero,zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    movdqa %xmm10, %xmm2
-; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[8,9],zero,zero,zero,zero,zero,zero,xmm2[10,11],zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    packusdw %xmm1, %xmm2
-; SSE41-NEXT:    pshufb {{.*#+}} xmm10 = xmm10[4,5],zero,zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
 ; SSE41-NEXT:    packusdw %xmm10, %xmm0
-; SSE41-NEXT:    packusdw %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: psubus_8i64_max:
@@ -1656,13 +1651,10 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
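
The rule the patch implements is: the known bits of a shuffle result are the facts that hold for every demanded source element, where an SM_SentinelZero lane contributes "all bits known zero" and an SM_SentinelUndef lane collapses the common state to unknown. What follows is a minimal standalone C++ sketch of that merging rule, not LLVM code: Known16, SentinelUndef/SentinelZero and knownBitsThroughShuffle are invented names for illustration, and the per-operand computeKnownBits recursion is modeled as a precomputed per-element table.

// knownbits_shuffle_sketch.cpp - hypothetical standalone illustration of the
// known-bits merging rule added by this patch (does not use LLVM's APIs).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal stand-in for llvm::KnownBits over a single 16-bit lane.
struct Known16 {
  uint16_t Zero; // bits known to be 0
  uint16_t One;  // bits known to be 1
};

// Sentinels mirroring SM_SentinelUndef / SM_SentinelZero.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Compute the bits shared by every demanded element of a shuffle result.
// Srcs holds per-element known bits of the (already concatenated) inputs,
// Mask[i] names the source element feeding result element i (or a sentinel),
// Demanded marks which result elements the caller cares about.
Known16 knownBitsThroughShuffle(const std::vector<Known16> &Srcs,
                                const std::vector<int> &Mask,
                                const std::vector<bool> &Demanded) {
  Known16 Known{0xFFFF, 0xFFFF}; // start from "everything known", intersect away
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (!Demanded[i])
      continue;
    int M = Mask[i];
    if (M == SentinelUndef)
      return Known16{0, 0};      // an undef lane poisons the common state
    if (M == SentinelZero) {
      Known.One = 0;             // a zero lane: no bit is known one anymore
      continue;
    }
    assert(M >= 0 && (size_t)M < Srcs.size() && "shuffle index out of range");
    Known.Zero &= Srcs[M].Zero;  // keep only facts true for every demanded lane
    Known.One &= Srcs[M].One;
  }
  if (Known.Zero == 0xFFFF && Known.One == 0xFFFF)
    return Known16{0, 0};        // nothing was demanded: report unknown
  return Known;
}

int main() {
  // Two source lanes: one known to be exactly 0x00FF, one with its upper byte
  // known zero and its lower byte unknown.
  std::vector<Known16> Srcs = {{0xFF00, 0x00FF}, {0xFF00, 0x0000}};
  // The result takes lane 0, a forced-zero lane, and lane 1.
  std::vector<int> Mask = {0, SentinelZero, 1};
  std::vector<bool> Demanded = {true, true, true};
  Known16 K = knownBitsThroughShuffle(Srcs, Mask, Demanded);
  // Expected output: KnownZero=0xff00 KnownOne=0x0000, i.e. the upper byte is
  // zero in every demanded lane, while the zero lane killed the known-one bits.
  std::printf("KnownZero=0x%04x KnownOne=0x%04x\n", (unsigned)K.Zero, (unsigned)K.One);
  return 0;
}

The test diffs reflect this: with known bits now propagated through the target shuffles, the DAG combiner can drop the extra vpackssdw in pr35918.ll and emit the simpler zero-extend plus packusdw truncation in psubus.ll.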