[X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets
lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate isn't usually necessary to make the VPMOV node worthwhile (as we're only targeting v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count. Fixes the remaining regression from the fixes in rG293899c64b75.
commit 6ba5fc2dee (parent dd4c838da3)
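As a concrete illustration of the new fallback, here is a minimal IR sketch (hypothetical, not taken from the patch's tests; the function name, mask and target features are my own): a v8i16 shuffle that keeps the even elements and zeroes the upper half has no ISD::TRUNCATE in sight, so the old code gave up, but with AVX512VL it can now be matched by lowerShuffleWithVPMOV and lowered via a single VPMOVDW (modulo other DAG combines).

; Hypothetical example (not part of the patch). The shuffle keeps the even
; i16 elements and zeroes the upper half - effectively a v4i32 -> v4i16
; truncation written as a shuffle with no truncate node. On a VLX target
; the new fallback bitcasts %v to v4i32 and can select VPMOVDW directly
; instead of a PSHUFB with a constant mask.
define <8 x i16> @even_halves_to_vpmovdw(<8 x i16> %v) {
  %r = shufflevector <8 x i16> %v, <8 x i16> zeroinitializer,
       <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %r
}

Since the source elements here are 32-bit, the relaxed "!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32" check in the patch also allows this without AVX512BW.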
@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
     unsigned NumSrcElts = NumElts / Scale;
     unsigned UpperElts = NumElts - NumSrcElts;
     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
       continue;

+    // Attempt to find a matching source truncation, but as a fall back VLX
+    // cases can use the VPMOV directly.
     SDValue Src = peekThroughBitcasts(V1);
-    if (Src.getOpcode() != ISD::TRUNCATE ||
-        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+    if (Src.getOpcode() == ISD::TRUNCATE &&
+        Src.getScalarValueSizeInBits() == SrcEltBits) {
+      Src = Src.getOperand(0);
+    } else if (Subtarget.hasVLX()) {
+      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+      Src = DAG.getBitcast(SrcVT, Src);
+      // Don't do this if PACKSS/PACKUS could perform it cheaper.
+      if (Scale == 2 &&
+          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+        return SDValue();
+    } else
       return SDValue();
-    Src = Src.getOperand(0);

     // VPMOVWB is only available with avx512bw.
-    MVT SrcVT = Src.getSimpleValueType();
-    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
-        !Subtarget.hasBWI())
+    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
       return SDValue();

     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
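The Scale == 2 guard in the VLX fallback above is what keeps PACKSS/PACKUS preferred when they can do the job. A hedged sketch of the case it is screening for (hypothetical IR; whether this exact shuffle reaches lowerShuffleWithVPMOV unchanged depends on earlier combines):

; Hypothetical illustration of the "Don't do this if PACKSS/PACKUS could
; perform it cheaper" bail-out. The lshr leaves at least 16 known leading
; zero bits in every i32 lane, so for the Scale == 2 case the
; computeKnownBits().countMinLeadingZeros() >= EltSizeInBits condition
; holds and the fallback returns SDValue() rather than forming a VPMOV,
; deferring to the existing PACK-based lowering.
define <8 x i16> @prefers_pack(<4 x i32> %x) {
  %hi = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %v  = bitcast <4 x i32> %hi to <8 x i16>
  %r  = shufflevector <8 x i16> %v, <8 x i16> zeroinitializer,
        <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %r
}

As the commit message notes, the PACK route stays preferred because of its lower uop count.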
@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
 ;
 ; SKX-LABEL: trunc_qw_128:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT:    vpmovqw %xmm0, %xmm0
 ; SKX-NEXT:    retq
   %x = trunc <2 x i64> %i to <2 x i16>
   ret <2 x i16> %x
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT:    vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT:    retq
@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
 ; AVX512-LABEL: vf4:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdx)
 ; AVX512-NEXT:    retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32
@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
 ; AVX512-LABEL: vf2:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpmovqw %xmm0, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovd %xmm1, (%rsi)
-; AVX512-NEXT:    vmovd %xmm2, (%rdx)
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT:    vmovd %xmm1, (%rdx)
 ; AVX512-NEXT:    vmovd %xmm3, (%rcx)
-; AVX512-NEXT:    vmovd %xmm0, (%r8)
+; AVX512-NEXT:    vmovd %xmm2, (%r8)
 ; AVX512-NEXT:    retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32
@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: rot16_trunc:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT:    retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX:       # %bb.0:
+; AVX512NOVLX-NEXT:    vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT:    retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX:       # %bb.0:
+; AVX512VLX-NEXT:    vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT:    vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT:    retq
 ;
 ; XOP-LABEL: rot16_trunc:
 ; XOP:       # %bb.0: