From a50e8d3627eb7f56ebdf0351e18759bb50322c37 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Feb 2016 10:37:49 +0000 Subject: [PATCH] [X86][AVX] Support bit-mask integer shuffles for 256-bit integer vectors AVX1 doesn't support the shuffling of 256-bit integer vectors. For 32/64-bit elements we get around this by shuffling as float/double but for 8/16-bit elements (assuming they can't widen) we currently just split, shuffle as 128-bit vectors and concatenate the results back. This patch adds the ability to lower using the bit-mask patterns before defaulting to the splitting behaviour. In some cases this ends up matching what AVX2 would do anyhow or what AVX1 does on the split vectors. Part 1 of 2 Differential Revision: http://reviews.llvm.org/D17292 llvm-svn: 261081 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++++-- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 4 +--- llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 17 ++++------------- llvm/test/CodeGen/X86/vector-zext.ll | 16 +++++----------- 4 files changed, 16 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 37f9df27248c..5d6bdea80731 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11480,9 +11480,13 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); - if (ElementBits < 32) - // No floating point type available, decompose into 128-bit vectors. + if (ElementBits < 32) { + // No floating point type available, if we can't use the bit operations + // for masking then decompose into 128-bit vectors. + if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 87ade38e426b..42924069d9e6 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1754,11 +1754,9 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index aa122d14f131..5828d59b76ab 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -949,19 +949,10 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_ } define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { -; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255] -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; AVX2: # BB#0: -; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; ALL: # BB#0: +; ALL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index b41bb8f7b107..d07f1ea62dfe 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -145,11 +145,9 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_8i32: @@ -228,12 +226,10 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_16i8_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_4i64: @@ -390,12 +386,10 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_8i16_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_4i64: