[X86][SSE] Add target shuffle support to X86TargetLowering::computeKnownBitsForTargetNode

Ideally we'd use resolveTargetShuffleInputs to handle faux shuffles as well, but:
(a) that code path doesn't handle general/pre-legalized ops/types very well.
(b) I'm concerned about the compute time, as resolving the inputs recurses into calls to computeKnownBits/ComputeNumSignBits, which would need depth limiting somehow (a sketch of such depth limiting appears below).

llvm-svn: 334007
Simon Pilgrim 2018-06-05 10:52:29 +00:00
parent df4ca0eeda
commit fef9b6eea6
3 changed files with 59 additions and 17 deletions

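To make point (b) of the commit message concrete, here is a minimal standalone sketch of the depth-limiting pattern being alluded to. Everything in it (Node, knownZero, MaxDepth) is invented for illustration and is not LLVM's actual API; SelectionDAG's computeKnownBits enforces a similar hard recursion cap.

#include <cstdint>
#include <vector>

// Illustrative stand-in for an SDNode; a real node carries opcodes, types, etc.
struct Node {
  uint32_t KnownZeroSelf = 0;       // known-zero bits if this is a leaf
  std::vector<const Node *> Inputs; // e.g. resolved shuffle inputs
};

constexpr unsigned MaxDepth = 6; // a hard cap, like computeKnownBits uses

// Bits known to be zero in N's value. Past the cap we return 0 ("nothing
// known") instead of recursing further, which bounds the total work.
uint32_t knownZero(const Node &N, unsigned Depth = 0) {
  if (Depth >= MaxDepth)
    return 0;
  if (N.Inputs.empty())
    return N.KnownZeroSelf;
  uint32_t Known = ~0u; // elements come from some input: intersect them all
  for (const Node *In : N.Inputs)
    Known &= knownZero(*In, Depth + 1);
  return Known;
}

int main() {
  Node Leaf{0xFFFF0000u, {}};      // upper 16 bits known zero
  Node Shuffle{0, {&Leaf, &Leaf}}; // draws all elements from Leaf
  return knownZero(Shuffle) == 0xFFFF0000u ? 0 : 1;
}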

@@ -28795,6 +28795,57 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
    Known.Zero.setBitsFrom(8);
    break;
  }

  // Handle target shuffles.
  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
  if (isTargetShuffle(Opc)) {
    bool IsUnary;
    SmallVector<int, 64> Mask;
    SmallVector<SDValue, 2> Ops;
    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
                             IsUnary)) {
      unsigned NumOps = Ops.size();
      unsigned NumElts = VT.getVectorNumElements();
      if (Mask.size() == NumElts) {
        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
        Known.Zero.setAllBits(); Known.One.setAllBits();
        for (unsigned i = 0; i != NumElts; ++i) {
          if (!DemandedElts[i])
            continue;
          int M = Mask[i];
          if (M == SM_SentinelUndef) {
            // For UNDEF elements, we don't know anything about the common state
            // of the shuffle result.
            Known.resetAll();
            break;
          } else if (M == SM_SentinelZero) {
            Known.One.clearAllBits();
            continue;
          }
          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
                 "Shuffle index out of range");

          unsigned OpIdx = (unsigned)M / NumElts;
          unsigned EltIdx = (unsigned)M % NumElts;
          if (Ops[OpIdx].getValueType() != VT) {
            // TODO - handle target shuffle ops with different value types.
            Known.resetAll();
            break;
          }
          DemandedOps[OpIdx].setBit(EltIdx);
        }
        // Known bits are the values that are shared by every demanded element.
        for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
          if (!DemandedOps[i])
            continue;
          KnownBits Known2;
          DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
          Known.One &= Known2.One;
          Known.Zero &= Known2.Zero;
        }
      }
    }
  }
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(

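The added block above works in two phases: phase 1 maps each demanded element of the shuffle result back to the source operand/element it reads from (recording per-operand demanded masks and handling the undef/zero sentinels), and phase 2 intersects the known bits of every demanded source element. The following self-contained sketch replays the same scheme with plain arrays standing in for SDValue/APInt; the values and the mask are invented for illustration only.

#include <cstdint>
#include <cstdio>

enum : int { SentinelUndef = -1, SentinelZero = -2 };

struct Known16 {
  uint16_t Zero, One; // bit known 0 / bit known 1 (never both for one bit)
};

int main() {
  // Per-element known bits of two 4 x i16 source operands: every element of
  // operand 0 has its low byte known zero and its sign bit known one.
  const Known16 Src[2][4] = {
      {{0x00FF, 0x8000}, {0x00FF, 0x8000}, {0x00FF, 0x8000}, {0x00FF, 0x8000}},
      {{0xFF00, 0x0000}, {0xFF00, 0x0000}, {0xFF00, 0x0000}, {0xFF00, 0x0000}}};
  const int Mask[4] = {0, 2, SentinelZero, 5}; // 5 = element 1 of operand 1
  const bool DemandedElts[4] = {true, true, true, false}; // elt 3 ignored

  // Phase 1: map demanded result elements to demanded source elements.
  bool DemandedOps[2][4] = {};
  Known16 Known = {0xFFFF, 0xFFFF}; // start "all known", then intersect away
  for (int i = 0; i != 4; ++i) {
    if (!DemandedElts[i])
      continue;
    int M = Mask[i];
    if (M == SentinelUndef) { // undef element: give up entirely
      Known = {0, 0};
      break;
    }
    if (M == SentinelZero) { // zero element: every bit is 0, none is 1
      Known.One = 0;
      continue;
    }
    DemandedOps[M / 4][M % 4] = true;
  }

  // Phase 2: intersect the known bits of every demanded source element.
  for (int Op = 0; Op != 2; ++Op)
    for (int Elt = 0; Elt != 4; ++Elt)
      if (DemandedOps[Op][Elt]) {
        Known.Zero &= Src[Op][Elt].Zero;
        Known.One &= Src[Op][Elt].One;
      }

  // Elements 0 and 1 read operand 0 (low byte zero, sign bit one); element 2
  // is forced zero. The shared fact: low byte known zero, sign bit unknown.
  printf("Zero=0x%04X One=0x%04X\n", (unsigned)Known.Zero,
         (unsigned)Known.One); // prints Zero=0x00FF One=0x0000
  return 0;
}

Note that a zero element only clears the known-one side: an all-zero element has every bit known zero, so it never shrinks the known-zero set, which is exactly what the SM_SentinelZero case in the patch does.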

@@ -59,8 +59,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
; X64-SKYLAKE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax
; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; X64-SKYLAKE-NEXT: movl %eax, (%rdi)

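The dropped vpackssdw above shows the payoff: when known bits prove each i32 lane already fits in an i16 (presumably established here by the preceding clamp/blend sequence), the signed-saturating pack is plain truncation, so the byte shuffle may read straight from the i32 lanes (bytes [0,4]) instead of from the packed words (bytes [0,2]). A scalar sketch of that legality condition, with an invented packssdw_lane helper modeling one lane's saturation:

#include <cassert>
#include <cstdint>

// Scalar model of one packssdw lane: signed-saturate i32 -> i16.
static int16_t packssdw_lane(int32_t V) {
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return (int16_t)V;
}

int main() {
  // If the lane is known to fit in i16, packing never changes the low byte,
  // so reading byte 0 of the packed word == reading byte 0 of the dword.
  for (int32_t V = INT16_MIN; V <= INT16_MAX; ++V)
    assert((uint8_t)packssdw_lane(V) == (uint8_t)V);

  // Out of range, saturation does change the bytes, and the fold is invalid.
  assert((uint8_t)packssdw_lane(0x00123456) != (uint8_t)0x00123456);
  return 0;
}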

@@ -1620,15 +1620,10 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: packusdw %xmm3, %xmm7
; SSE41-NEXT: packusdw %xmm4, %xmm7
; SSE41-NEXT: psubusw %xmm7, %xmm10
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
; SSE41-NEXT: movdqa %xmm10, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[12,13],zero,zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm10, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8,9],zero,zero,zero,zero,zero,zero,xmm2[10,11],zero,zero,zero,zero,zero,zero
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm10 = xmm10[4,5],zero,zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; SSE41-NEXT: packusdw %xmm10, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: psubus_8i64_max:
@@ -1656,13 +1651,10 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;