From c1cb733db66ee7c6a11fab0ea89f99c8f55b2286 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 25 Apr 2020 15:00:19 -0700
Subject: [PATCH] [X86] Improve lowering of v16i8->v16i1 truncate under
 prefer-vector-width=256.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 21 ++++++++++++-------
 .../CodeGen/X86/prefer-avx256-mask-shuffle.ll    | 10 ++++-----
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 44552eb706a7..6aa42fba4eb0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20382,17 +20382,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
   // we need to split into two 8 element vectors which we can extend to v8i32,
   // truncate and concat the results. There's an additional complication if
-  // the original type is v16i8. In that case we can't split the v16i8 so
-  // first we pre-extend it to v16i16 which we can split to v8i16, then extend
-  // to v8i32, truncate that to v8i1 and concat the two halves.
+  // the original type is v16i8. In that case we can't split the v16i8
+  // directly, so we need to shuffle high elements to low and use
+  // sign_extend_vector_inreg.
   if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+    SDValue Lo, Hi;
     if (InVT == MVT::v16i8) {
-      // First we need to sign extend up to 256-bits so we can split that.
-      InVT = MVT::v16i16;
-      In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+      Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
+      Hi = DAG.getVectorShuffle(
+          InVT, DL, In, In,
+          {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+      Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
+    } else {
+      assert(InVT == MVT::v16i16 && "Unexpected VT!");
+      Lo = extract128BitVector(In, 0, DAG, DL);
+      Hi = extract128BitVector(In, 8, DAG, DL);
     }
-    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
-    SDValue Hi = extract128BitVector(In, 8, DAG, DL);
     // We're split now, just emit two truncates and a concat. The two
     // truncates will trigger legalization to come back to this function.
     Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index bf5ba184fc00..904d0ff0025e 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -133,14 +133,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
 ; AVX256VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX256VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX256VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256VL-NEXT:    vpmovsxbw %xmm1, %xmm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX256VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256VL-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k2
-; AVX256VL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX256VL-NEXT:    vpmovsxbd %xmm0, %ymm0
 ; AVX256VL-NEXT:    vptestmd %ymm0, %ymm0, %k3
 ; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k3} {z}