AVX-512: Implemented DAG lowering for shuff64x2/shufi64x2 instructions (Shuffle Packed Values at 128-bit Granularity)

Tests added; vector-shuffle-512-v8.ll test re-generated.

Differential Revision: http://reviews.llvm.org/D10300

llvm-svn: 239697
This commit is contained in:
Igor Breger 2015-06-14 13:07:47 +00:00
parent ce1ce989e2
commit 5e49697138
3 changed files with 88 additions and 4 deletions

View File

@ -9383,6 +9383,30 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Handle lowering 4-lane 128-bit shuffles.
///
/// Tries to express a shuffle whose mask has already been widened to four
/// 128-bit lane selectors as a single X86ISD::SHUF128 node.
///
/// The vshuf64x2-style instruction takes the two low result lanes from its
/// first operand and the two high result lanes from its second operand, so
/// each half of the result must draw all of its lanes from one input vector.
/// The input used by a half becomes the corresponding SHUF128 operand; a half
/// that references no input at all gets an undef operand.
///
/// \param WidenedMask four lane selectors: 0-3 pick a lane of \p V1, 4-7 pick
///        a lane of \p V2, SM_SentinelUndef marks a don't-care lane.
/// \returns the SHUF128 node, or an empty SDValue when the mask contains
///          SM_SentinelZero or when one half of the result mixes lanes of
///          \p V1 and \p V2 (the caller then falls back to other lowerings).
static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> WidenedMask,
                                        SelectionDAG &DAG) {
  assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");

  const int NumLanes = WidenedMask.size();

  // Pick the source operand for each half of the result and make sure that
  // every lane of a half comes from the same input vector.
  SDValue Ops[2];
  for (int i = 0; i < NumLanes; ++i) {
    const int Lane = WidenedMask[i];
    if (Lane == SM_SentinelZero)
      return SDValue();
    if (Lane == SM_SentinelUndef)
      continue;

    SDValue Src = Lane < NumLanes ? V1 : V2;
    SDValue &HalfOp = Ops[i < NumLanes / 2 ? 0 : 1];
    if (!HalfOp.getNode())
      HalfOp = Src;
    else if (HalfOp != Src)
      return SDValue();
  }

  // A half made only of undef lanes may read from anything.
  for (SDValue &HalfOp : Ops)
    if (!HalfOp.getNode())
      HalfOp = DAG.getUNDEF(VT);

  // Form a 128-bit permutation.
  // Convert the 64-bit shuffle mask selection values into 128-bit selection
  // bits defined by a vshuf64x2 instruction's immediate control byte: two
  // bits per result lane, lowest lane in the lowest bits.
  unsigned PermMask = 0;
  for (int i = 0; i < NumLanes; ++i) {
    // Use the first lane in place of an undef mask element.
    const unsigned Imm =
        (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
    PermMask |= (Imm % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
@ -10176,6 +10200,10 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
return Op;
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))

View File

@ -116,10 +116,10 @@ define <16 x i32> @test15(<16 x i32> %a) {
ret <16 x i32> %b
}
; CHECK-LABEL: test16
; CHECK: valignq $2, %zmm0, %zmm1
; CHECK: valignq $3, %zmm0, %zmm1
; CHECK: ret
define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x double> %c
}
@ -252,6 +252,62 @@ define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
ret <8 x double> %c
}
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; Unmasked 512-bit floating-point shuffle that moves whole 128-bit lanes.
; Every mask index is below 8, so only %x is selected. Pairing the indices
; gives 128-bit lanes 0, 2, 0, 2; packed two bits per lane (lowest lane first)
; that is 0 | 2<<2 | 0<<4 | 2<<6 = 136, the immediate expected below.
define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
; CHECK-LABEL: test_vshuff64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
ret <8 x double> %res
}
; Same lane shuffle as the unmasked variant (lanes 0, 2, 0, 2 -> immediate
; 136), but the result is selected against zeroinitializer under %mask.
; The expected output materializes %mask into %k1 and applies it to the
; shuffle instruction itself with zeroing ({%k1} {z}).
define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
; CHECK-LABEL: test_vshuff64x2_512_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
ret <8 x double> %res
}
; Integer counterpart of the masked lane shuffle.
; The mask pairs up into 128-bit lanes 0, 2, 2, 2, i.e. the immediate
; 0 | 2<<2 | 2<<4 | 2<<6 = 168. Unselected elements fall back to %x rather
; than zero, so the expected instruction carries {%k1} without {z}.
define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
; CHECK-LABEL: test_vshufi64x2_512_mask:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
%y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5>
%res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
ret <8 x i64> %res
}
; Lane shuffle whose second operand is loaded from memory.
; All mask indices are below 8, so the loaded %x1 is never selected and the
; expected instruction uses register operands only. The mask pairs up into
; 128-bit lanes 0, 2, 2, 0, i.e. the immediate 0 | 2<<2 | 2<<4 | 0<<6 = 40.
define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
; CHECK-LABEL: test_vshuff64x2_512_mem:
; CHECK: ## BB#0:
; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = load <8 x double>,<8 x double> *%ptr,align 1
%res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1>
ret <8 x double> %res
}
; Lane shuffle on 32-bit elements with the second operand loaded from memory.
; All mask indices are below 16, so the loaded %x1 is never selected. Taking
; the indices four at a time gives 128-bit lanes 0, 1, 1, 0, i.e. the
; immediate 0 | 1<<2 | 1<<4 | 0<<6 = 20.
; NOTE(review): the function is named for vshuff32x4 but the expected mnemonic
; below is vshuff64x2 - confirm this is the intended instruction selection.
define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
; CHECK-LABEL: test_vshuff32x4_512_mem:
; CHECK: ## BB#0:
; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x float>,<16 x float> *%ptr,align 1
%res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <16 x float> %res
}
define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
; CHECK-LABEL: test_align_v16i32_rr:
; CHECK: ## BB#0:

View File

@ -88,7 +88,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01014545:
; ALL: # BB#0:
; ALL-NEXT: vpermpd $68, %zmm0, %zmm0
; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@ -650,7 +650,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01014545:
; ALL: # BB#0:
; ALL-NEXT: vpermq $68, %zmm0, %zmm0
; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle