From 04e79329d0904400398f87b1b52734e271588ed6 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein
Date: Thu, 8 Oct 2015 08:13:02 +0000
Subject: [PATCH] [X86] Fix wrong treatment of multi-lane blends in
 BUILD_VECTORtoBlendMask()

This fixes two separate bugs:
1) The mask for the high lane was not set correctly. That fixes PR24532.
2) The transformation should bail out if it believes it involves more than
   2 lanes, as it does not currently do anything sensible in this case.

Differential Revision: http://reviews.llvm.org/D13505

llvm-svn: 249669
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++++-
 llvm/test/CodeGen/X86/vector-blend.ll   | 72 +++++++++++++------------
 2 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9c57e435630..60b25994865e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11085,8 +11085,13 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
                                     unsigned &MaskValue) {
   MaskValue = 0;
   unsigned NumElems = BuildVector->getNumOperands();
 
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  // We don't handle the >2 lanes case right now.
   unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  if (NumLanes > 2)
+    return false;
+
   unsigned NumElemsInLane = NumElems / NumLanes;
 
   // Blend for v16i16 should be symmetric for the both lanes.
@@ -11101,16 +11106,21 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
     if (isa<ConstantSDNode>(SndLaneEltCond))
       Lane2Cond = !isZero(SndLaneEltCond);
 
+    unsigned LaneMask = 0;
     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
       // Lane1Cond != 0, means we want the first argument.
       // Lane1Cond == 0, means we want the second argument.
       // The encoding of this argument is 0 for the first argument, 1
       // for the second. Therefore, invert the condition.
-      MaskValue |= !Lane1Cond << i;
+      LaneMask = !Lane1Cond << i;
     else if (Lane1Cond < 0)
-      MaskValue |= !Lane2Cond << i;
+      LaneMask = !Lane2Cond << i;
     else
       return false;
+
+    MaskValue |= LaneMask;
+    if (NumLanes == 2)
+      MaskValue |= LaneMask << NumElemsInLane;
   }
   return true;
 }
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index e15daaa54a33..aaf81f2f9bb6 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -255,31 +255,32 @@ entry:
 define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 ; SSE2-LABEL: vsel_i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: vsel_i8:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,xmm1[9,10,11],zero,xmm1[13,14,15]
 ; SSSE3-NEXT: por %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: vsel_i8:
 ; SSE41: # BB#0: # %entry
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: vsel_i8:
 ; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
   %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
@@ -623,49 +624,52 @@ entry:
 define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ; SSE2-LABEL: constant_pblendvb_avx2:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movaps {{.*#+}} xmm4 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
 ; SSE2-NEXT: movaps %xmm4, %xmm5
-; SSE2-NEXT: andnps %xmm2, %xmm5
-; SSE2-NEXT: andps %xmm4, %xmm0
-; SSE2-NEXT: orps %xmm5, %xmm0
-; SSE2-NEXT: andps %xmm4, %xmm1
-; SSE2-NEXT: andnps %xmm3, %xmm4
-; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: andnps %xmm0, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm3
+; SSE2-NEXT: andnps %xmm1, %xmm4
+; SSE2-NEXT: orps %xmm3, %xmm4
+; SSE2-NEXT: movaps %xmm5, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: constant_pblendvb_avx2:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,128,3,128,128,128,7,128,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [128,128,2,128,4,5,6,128,8,9,10,11,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [128,128,2,128,4,5,6,128,128,128,10,128,12,13,14,128]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,128,3,128,128,128,7,8,9,128,11,128,128,128,15]
+; SSSE3-NEXT: pshufb %xmm5, %xmm2
 ; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm3
 ; SSSE3-NEXT: por %xmm3, %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: constant_pblendvb_avx2:
 ; SSE41: # BB#0: # %entry
 ; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: pblendvb %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE41-NEXT: pblendvb %xmm2, %xmm4
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_pblendvb_avx2:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vmovdqa .LCPI18_0(%rip), %xmm4 # xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_pblendvb_avx2:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 entry:
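
Note (not part of the patch): below is a minimal standalone sketch of the mask
construction the fix performs, with invented names, plain 0/1 conditions, and
the undef/non-constant handling left out. Each lane's blend bit is computed
once from the (symmetric) per-lane conditions and then OR'd into both halves
of the immediate; the second OR is the high-lane update that was previously
missing.

// pblendvb_mask_sketch.cpp: illustrative only, not LLVM code.
#include <cassert>
#include <cstdio>
#include <vector>

// Build an x86 blend immediate from a per-element "take the first operand"
// condition vector. The blend must pick the same side in both (at most two)
// 128-bit lanes, and each lane bit is written into *both* halves of the
// immediate; the shift by NumElemsInLane is the high-lane bit that the old
// code never set.
static bool buildBlendMask(const std::vector<int> &Cond, unsigned &MaskValue) {
  MaskValue = 0;
  unsigned NumElems = Cond.size();
  // Two lanes when there are more than 8 elements, one lane otherwise;
  // give up on anything wider, as the patch now does.
  unsigned NumLanes = (NumElems - 1) / 8 + 1;
  if (NumLanes > 2)
    return false;
  unsigned NumElemsInLane = NumElems / NumLanes;

  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    int Lane1Cond = Cond[i];
    int Lane2Cond = (NumLanes == 2) ? Cond[i + NumElemsInLane] : Lane1Cond;
    if (Lane1Cond != Lane2Cond)
      return false; // not symmetric across lanes, cannot use an immediate
    // The immediate encodes 0 for the first operand and 1 for the second,
    // so the condition is inverted.
    unsigned LaneMask = (unsigned)!Lane1Cond << i;
    MaskValue |= LaneMask;
    if (NumLanes == 2)
      MaskValue |= LaneMask << NumElemsInLane; // replicate into the high lane
  }
  return true;
}

int main() {
  // v16i16-style blend: two 8-element lanes, take the first operand only in
  // elements 0 and 8.
  std::vector<int> Cond(16, 0);
  Cond[0] = Cond[8] = 1;
  unsigned Mask = 0;
  bool Ok = buildBlendMask(Cond, Mask);
  assert(Ok);
  std::printf("mask = 0x%04x\n", Mask); // prints 0xfefe: both lanes agree
  return 0;
}

For a single-lane blend (8 or fewer elements) the high-lane OR is simply
skipped and the result is the plain per-element immediate.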