forked from OSchip/llvm-project
[AVX-512] Teach shuffle lowering to use vinsert instructions for shuffles corresponding to 256-bit subvector inserts.
llvm-svn: 290870
This commit is contained in:
parent fa875a1d3d
commit 9496e3f916
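For context, the shuffles this change targets are 512-bit shuffles whose mask reads "low 256 bits of one source, then low 256 bits of another", which is exactly what a single vinsertf64x4/vinserti64x4 $1 computes. A minimal standalone C++ sketch of that mask shape (my illustration, not code from this commit; shufflevector masks number the second operand's elements starting at NumElts):

#include <array>

// Hypothetical check for an 8 x i64 shuffle mask that is really a 256-bit
// subvector insert: result = concat(lo256(V1), lo256(V2)). Elements of V2
// are numbered 8..15 in the mask, so lo256(V2) is indices 8..11.
static bool isLo256ConcatMask(const std::array<int, 8> &Mask) {
  for (int i = 0; i < 4; ++i)
    if (Mask[i] != i)        // low half: V1 elements 0..3
      return false;
  for (int i = 4; i < 8; ++i)
    if (Mask[i] != i + 4)    // high half: V2 elements 0..3 (indices 8..11)
      return false;
  return true;
}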
@@ -12590,6 +12590,40 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   }
 }
 
+/// \brief Try to lower a vector shuffle as a 256-bit shuffle.
+static SDValue lowerV2X256VectorShuffle(const SDLoc &DL, MVT VT,
+                                        ArrayRef<int> Mask, SDValue V1,
+                                        SDValue V2, SelectionDAG &DAG) {
+  assert(VT.getScalarSizeInBits() == 64 &&
+         "Unexpected element type size for 128bit shuffle.");
+
+  // To handle 256 bit vector requires VLX and most probably
+  // function lowerV2X128VectorShuffle() is better solution.
+  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+
+  assert(Mask.size() == 4 && "Expect mask to already be widened to 128-bits.");
+
+  SmallVector<int, 2> WidenedMask;
+  if (!canWidenShuffleElements(Mask, WidenedMask))
+    return SDValue();
+
+  // Check for patterns which can be matched with a single insert of a 256-bit
+  // subvector.
+  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, WidenedMask, {0, 0});
+  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, WidenedMask, {0, 2})) {
+    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                 VT.getVectorNumElements() / 2);
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0, DL));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                              OnlyUsesV1 ? V1 : V2,
+                              DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+
+  return SDValue();
+}
+
 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                         ArrayRef<int> Mask, SDValue V1,
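The new helper first collapses the already-widened 128-bit-lane mask into two 256-bit lanes via canWidenShuffleElements. Roughly, widening succeeds only when adjacent mask elements form consecutive even/odd pairs; a simplified standalone model of that step (assumed behavior, ignoring the real helper's undef/sentinel handling):

#include <vector>

// Simplified model: widen a shuffle mask by pairing adjacent elements.
// Pair (2k, 2k+1) widens to k; any other pair defeats the widening.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i];
    if (Lo < 0 || (Lo % 2) != 0 || Mask[i + 1] != Lo + 1)
      return false;
    Widened.push_back(Lo / 2);
  }
  return true;
}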
@@ -12605,6 +12639,11 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
   if (!canWidenShuffleElements(Mask, WidenedMask))
     return SDValue();
 
+  // See if we can widen even further to a 256-bit element.
+  if (SDValue Shuf256 = lowerV2X256VectorShuffle(DL, VT, WidenedMask, V1, V2,
+                                                 DAG))
+    return Shuf256;
+
   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
   // Insure elements came from the same Op.
   int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
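After widening to two 256-bit lanes, lane 0 is lo256(V1), lane 1 is hi256(V1), lane 2 is lo256(V2), and lane 3 is hi256(V2), so the two masks the helper accepts decode as follows (a hypothetical spelled-out version of the isShuffleEquivalent checks above, not in-tree code):

// Hypothetical decoding of the two accepted 2-lane widened masks.
// {0, 0} -> concat(lo256(V1), lo256(V1)): only V1 is used.
// {0, 2} -> concat(lo256(V1), lo256(V2)): a vinsert*x4 $1 of lo256(V2)
//           into the upper half of V1.
static bool matchesSubvectorInsert(const int Lanes[2], bool &OnlyUsesV1) {
  OnlyUsesV1 = (Lanes[0] == 0 && Lanes[1] == 0);
  return OnlyUsesV1 || (Lanes[0] == 0 && Lanes[1] == 2);
}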
@@ -3020,11 +3020,10 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x do
 define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
 ; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
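The vinsertf64x4 $1 that replaces each lane-crossing vshuff64x2 simply overwrites the upper 256 bits of the destination with a YMM source; a scalar model of the unmasked form (my paraphrase of the instruction's effect, with k-register masking omitted):

#include <cstring>

// Scalar model of unmasked "vinsertf64x4 $1, %ymm_src, %zmm_src, %zmm_dst":
// the low four doubles pass through, the high four come from the YMM source.
static void insertf64x4_upper(const double Src512[8], const double Src256[4],
                              double Dst[8]) {
  std::memcpy(Dst, Src512, 4 * sizeof(double));      // low 256 bits unchanged
  std::memcpy(Dst + 4, Src256, 4 * sizeof(double));  // high 256 bits replaced
}

The same picture applies to the integer and 32-bit-element variants (vinserti64x4, vinsertf32x8, vinserti32x8) in the hunks below.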
@@ -3041,11 +3040,10 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i3
 define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
@@ -30,9 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT:    vpmovm2q %k0, %zmm0
-; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm1
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpmovq2m %zmm0, %k0
 ; CHECK-NEXT:    vpmovm2w %k0, %xmm0
 ; CHECK-NEXT:    retq
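Here the concat-with-zero pattern is now built as a zeroed ZMM plus a single vinserti64x4, instead of a vshufi64x2 against a separate zero vector; a scalar model of the net effect of the new two-instruction sequence (illustrative only):

#include <cstdint>

// Scalar model of "vpxord %zmm0,%zmm0,%zmm0" followed by
// "vinserti64x4 $1, %ymm1, %zmm0, %zmm0": the result is a zero low half
// with the 256-bit payload in the high half.
static void concatZeroWithPayload(const int64_t Payload[4], int64_t Dst[8]) {
  for (int i = 0; i < 4; ++i)
    Dst[i] = 0;                 // low half: zeroed by vpxord
  for (int i = 0; i < 4; ++i)
    Dst[4 + i] = Payload[i];    // high half: written by vinserti64x4 $1
}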
@@ -58,11 +58,10 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x fl
 define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm3 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    retq
@@ -102,11 +101,10 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>,
 define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm3 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq