From bec6543d175670fe962d144c0aafcc9aa168fcb8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 5 Jul 2016 20:11:29 +0000
Subject: [PATCH] [X86][AVX2] Add support for target shuffle combining to
 BROADCAST

Only support broadcast from vector register so far - memory folding
support will have to wait.

llvm-svn: 274572
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  26 +++-
 .../X86/vector-shuffle-combining-avx2.ll      | 124 ++++++++++++++++++
 .../X86/vector-shuffle-combining-avx512bw.ll  |  30 +++++
 3 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d6a36901520e..4dad0c18cd7b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24717,13 +24717,10 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
     return true;
   }
 
-  if (!FloatDomain)
-    return false;
-
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
-  if (SrcVT.is128BitVector() && Subtarget.hasSSE3()) {
+  if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       ShuffleVT = MVT::v2f64;
@@ -24741,7 +24738,7 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
     }
   }
 
-  if (SrcVT.is256BitVector()) {
+  if (SrcVT.is256BitVector() && FloatDomain) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
@@ -24760,7 +24757,7 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
     }
   }
 
-  if (SrcVT.is512BitVector()) {
+  if (SrcVT.is512BitVector() && FloatDomain) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -24782,6 +24779,23 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to match against broadcast-from-vector.
+  if (Subtarget.hasAVX2()) {
+    for (MVT SVT :
+         {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64}) {
+      if (FloatDomain != SVT.isFloatingPoint())
+        continue;
+
+      unsigned NumElts = SrcVT.getSizeInBits() / SVT.getSizeInBits();
+      SmallVector<int, 64> BroadcastMask(NumElts, 0);
+      if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+        Shuffle = X86ISD::VBROADCAST;
+        ShuffleVT = MVT::getVectorVT(SVT, NumElts);
+        return true;
+      }
+    }
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index b6e66c980e9b..8324c5810f91 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -3,6 +3,7 @@
 
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
 
 define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
@@ -59,3 +60,126 @@ define <4 x i64> @combine_permq_pshufb(<4 x i64> %a0) {
   %4 = bitcast <32 x i8> %3 to <4 x i64>
   ret <4 x i64> %4
 }
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
+  %4 = bitcast <32 x i8> %3 to <8 x i32>
+  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+  %6 = bitcast <8 x i32> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x i64> %1 to <32 x i8>
+  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+  %4 = bitcast <32 x i8> %3 to <8 x i32>
+  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+  %6 = bitcast <8 x i32> %5 to <32 x i8>
+  ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastd128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
+  ret <16 x i8> %2
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastd256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
+  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastq256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %3
+}
+
+define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <4 x float> %a to <16 x i8>
+  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+  %3 = bitcast <16 x i8> %2 to <4 x float>
+  ret <4 x float> %3
+}
+
+define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastss256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+  ret <8 x float> %2
+}
+
+define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastsd256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %2 = bitcast <4 x double> %1 to <8 x float>
+  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+  %4 = bitcast <8 x float> %3 to <4 x double>
+  ret <4 x double> %4
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 4a80663f672b..751ee526c27d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -5,6 +5,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x
 
 declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
 declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
 declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
@@ -369,3 +371,31 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
   ret <64 x i8> %res1
 }
+
+define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %1
+}
+
+define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %1
+}