forked from OSchip/llvm-project
[X86, AVX] recognize shufflevector with zero input as a vperm2 (PR22984)
vperm2x128 instructions have the special ability (aka free hardware capability) to shuffle zero values into a vector. This patch recognizes that type of shuffle and generates the appropriate control byte. https://llvm.org/bugs/show_bug.cgi?id=22984 Differential Revision: http://reviews.llvm.org/D8563 llvm-svn: 233100
This commit is contained in:
parent
a98fac28aa
commit
99d246d7d7
|
@ -9055,33 +9055,48 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
|
|||
SDValue V2, ArrayRef<int> Mask,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
// TODO: If minimizing size and one of the inputs is a zero vector and the
|
||||
// zero vector has only one use, we could use a VPERM2X128 to save the
|
||||
// instruction bytes needed to explicitly generate the zero vector.
|
||||
|
||||
// Blends are faster and handle all the non-lane-crossing cases.
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
|
||||
VT.getVectorNumElements() / 2);
|
||||
bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
|
||||
bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
|
||||
|
||||
// If either input operand is a zero vector, use VPERM2X128 because its mask
|
||||
// allows us to replace the zero input with an implicit zero.
|
||||
if (!IsV1Zero && !IsV2Zero) {
|
||||
// Check for patterns which can be matched with a single insert of a 128-bit
|
||||
// subvector.
|
||||
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
|
||||
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
|
||||
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
|
||||
VT.getVectorNumElements() / 2);
|
||||
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
|
||||
DAG.getIntPtrConstant(0));
|
||||
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
|
||||
OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
|
||||
}
|
||||
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
|
||||
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
|
||||
DAG.getIntPtrConstant(0));
|
||||
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
|
||||
DAG.getIntPtrConstant(2));
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
|
||||
}
|
||||
|
||||
// Otherwise form a 128-bit permutation.
|
||||
// FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
|
||||
// Otherwise form a 128-bit permutation. After accounting for undefs,
|
||||
// convert the 64-bit shuffle mask selection values into 128-bit
|
||||
// selection bits by dividing the indexes by 2 and shifting into positions
|
||||
// defined by a vperm2*128 instruction's immediate control byte.
|
||||
|
||||
// The immediate permute control byte looks like this:
|
||||
// [1:0] - select 128 bits from sources for low half of destination
|
||||
// [2] - ignore
|
||||
// [3] - zero low half of destination
|
||||
// [5:4] - select 128 bits from sources for high half of destination
|
||||
// [6] - ignore
|
||||
// [7] - zero high half of destination
|
||||
|
||||
int MaskLO = Mask[0];
|
||||
if (MaskLO == SM_SentinelUndef)
|
||||
MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
|
||||
|
@ -9091,6 +9106,27 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
|
|||
MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
|
||||
|
||||
unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
|
||||
|
||||
// If either input is a zero vector, replace it with an undef input.
|
||||
// Shuffle mask values < 4 are selecting elements of V1.
|
||||
// Shuffle mask values >= 4 are selecting elements of V2.
|
||||
// Adjust each half of the permute mask by clearing the half that was
|
||||
// selecting the zero vector and setting the zero mask bit.
|
||||
if (IsV1Zero) {
|
||||
V1 = DAG.getUNDEF(VT);
|
||||
if (MaskLO < 4)
|
||||
PermMask = (PermMask & 0xf0) | 0x08;
|
||||
if (MaskHI < 4)
|
||||
PermMask = (PermMask & 0x0f) | 0x80;
|
||||
}
|
||||
if (IsV2Zero) {
|
||||
V2 = DAG.getUNDEF(VT);
|
||||
if (MaskLO >= 4)
|
||||
PermMask = (PermMask & 0xf0) | 0x08;
|
||||
if (MaskHI >= 4)
|
||||
PermMask = (PermMask & 0x0f) | 0x80;
|
||||
}
|
||||
|
||||
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
|
||||
DAG.getConstant(PermMask, MVT::i8));
|
||||
}
|
||||
|
|
|
@ -261,3 +261,94 @@ entry:
|
|||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
|
||||
ret <8 x float> %shuffle
|
||||
}
|
||||
|
||||
;; Test zero mask generation.
|
||||
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
|
||||
;; Prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
|
||||
|
||||
define <4 x double> @vperm2z_0x08(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x08:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x18(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x18:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x28(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x28:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x38(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x38:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
|
||||
; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x80(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x80:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x81(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x81:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x82(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x82:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
define <4 x double> @vperm2z_0x83(<4 x double> %a) {
|
||||
; ALL-LABEL: vperm2z_0x83:
|
||||
; ALL: # BB#0:
|
||||
; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
|
||||
; ALL-NEXT: retq
|
||||
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
|
||||
ret <4 x double> %s
|
||||
}
|
||||
|
||||
;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
|
||||
|
||||
define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
|
||||
; ALL-LABEL: vperm2z_int_0x83:
|
||||
; ALL: # BB#0:
|
||||
; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0
|
||||
; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0
|
||||
%s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
|
||||
%c = add <4 x i64> %b, %s
|
||||
ret <4 x i64> %c
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue