forked from OSchip/llvm-project
[X86][SSE] Utilize ZeroableElements for canWidenShuffleElements
canWidenShuffleElements can do a better job if given a mask with ZeroableElements info. Apparently, ZeroableElements was only being used to identify AllZero candidates, but we could possibly plug it into more shuffle matchers. Original patch by Zvi Rackover (@zvi). Differential Revision: https://reviews.llvm.org/D42044 llvm-svn: 336903
This commit is contained in:
parent
9b00a8e9d7
commit
44b89fa900
|
@ -15048,20 +15048,49 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
|
||||||
if (Zeroable.isAllOnesValue())
|
if (Zeroable.isAllOnesValue())
|
||||||
return getZeroVector(VT, Subtarget, DAG, DL);
|
return getZeroVector(VT, Subtarget, DAG, DL);
|
||||||
|
|
||||||
|
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
|
||||||
|
|
||||||
|
// Create an alternative mask with info about zeroable elements.
|
||||||
|
// Here we do not set undef elements as zeroable.
|
||||||
|
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
|
||||||
|
if (V2IsZero) {
|
||||||
|
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
|
||||||
|
for (int i = 0; i != NumElements; ++i)
|
||||||
|
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
|
||||||
|
ZeroableMask[i] = SM_SentinelZero;
|
||||||
|
}
|
||||||
|
|
||||||
// Try to collapse shuffles into using a vector type with fewer elements but
|
// Try to collapse shuffles into using a vector type with fewer elements but
|
||||||
// wider element types. We cap this to not form integers or floating point
|
// wider element types. We cap this to not form integers or floating point
|
||||||
// elements wider than 64 bits, but it might be interesting to form i128
|
// elements wider than 64 bits, but it might be interesting to form i128
|
||||||
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
|
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
|
||||||
SmallVector<int, 16> WidenedMask;
|
SmallVector<int, 16> WidenedMask;
|
||||||
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
|
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
|
||||||
canWidenShuffleElements(Mask, WidenedMask)) {
|
canWidenShuffleElements(ZeroableMask, WidenedMask)) {
|
||||||
MVT NewEltVT = VT.isFloatingPoint()
|
MVT NewEltVT = VT.isFloatingPoint()
|
||||||
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
|
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
|
||||||
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
|
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
|
||||||
MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
|
int NewNumElts = NumElements / 2;
|
||||||
|
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
|
||||||
// Make sure that the new vector type is legal. For example, v2f64 isn't
|
// Make sure that the new vector type is legal. For example, v2f64 isn't
|
||||||
// legal on SSE1.
|
// legal on SSE1.
|
||||||
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
|
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
|
||||||
|
if (V2IsZero) {
|
||||||
|
// Modify the new Mask to take all zeros from the all-zero vector.
|
||||||
|
// Choose indices that are blend-friendly.
|
||||||
|
bool UsedZeroVector = false;
|
||||||
|
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
|
||||||
|
"V2's non-undef elements are used?!");
|
||||||
|
for (int i = 0; i != NewNumElts; ++i)
|
||||||
|
if (WidenedMask[i] == SM_SentinelZero) {
|
||||||
|
WidenedMask[i] = i + NewNumElts;
|
||||||
|
UsedZeroVector = true;
|
||||||
|
}
|
||||||
|
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
|
||||||
|
// some elements to be undef.
|
||||||
|
if (UsedZeroVector)
|
||||||
|
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
|
||||||
|
}
|
||||||
V1 = DAG.getBitcast(NewVT, V1);
|
V1 = DAG.getBitcast(NewVT, V1);
|
||||||
V2 = DAG.getBitcast(NewVT, V2);
|
V2 = DAG.getBitcast(NewVT, V2);
|
||||||
return DAG.getBitcast(
|
return DAG.getBitcast(
|
||||||
|
|
|
@ -9,9 +9,7 @@
|
||||||
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
|
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
|
||||||
; AVX-LABEL: castA:
|
; AVX-LABEL: castA:
|
||||||
; AVX: ## %bb.0:
|
; AVX: ## %bb.0:
|
||||||
; AVX-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
|
; AVX-NEXT: vmovaps %xmm0, %xmm0
|
||||||
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
|
||||||
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
|
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
|
||||||
ret <8 x float> %shuffle.i
|
ret <8 x float> %shuffle.i
|
||||||
|
|
|
@ -2890,31 +2890,15 @@ define <32 x i8> @zeroable_src_to_zext(<32 x i8> %a0) {
|
||||||
; AVX1-LABEL: zeroable_src_to_zext:
|
; AVX1-LABEL: zeroable_src_to_zext:
|
||||||
; AVX1: # %bb.0:
|
; AVX1: # %bb.0:
|
||||||
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
|
||||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
||||||
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
|
||||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||||
; AVX1-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX2-LABEL: zeroable_src_to_zext:
|
; AVX2OR512VL-LABEL: zeroable_src_to_zext:
|
||||||
; AVX2: # %bb.0:
|
; AVX2OR512VL: # %bb.0:
|
||||||
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
; AVX2OR512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
||||||
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
; AVX2OR512VL-NEXT: retq
|
||||||
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
|
|
||||||
; AVX2-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VLBW-LABEL: zeroable_src_to_zext:
|
|
||||||
; AVX512VLBW: # %bb.0:
|
|
||||||
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
||||||
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
|
||||||
; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
|
|
||||||
; AVX512VLBW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VLVBMI-LABEL: zeroable_src_to_zext:
|
|
||||||
; AVX512VLVBMI: # %bb.0:
|
|
||||||
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [32,33,0,0,0,0,0,0,34,35,0,0,0,0,0,0,36,37,16,16,16,16,16,16,38,39,16,16,16,16,16,16]
|
|
||||||
; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
|
||||||
; AVX512VLVBMI-NEXT: vpermt2b %ymm0, %ymm2, %ymm1
|
|
||||||
; AVX512VLVBMI-NEXT: vmovdqa %ymm1, %ymm0
|
|
||||||
; AVX512VLVBMI-NEXT: retq
|
|
||||||
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||||
%2 = shufflevector <32 x i8> %1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 20, i32 21, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 22, i32 23, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
|
%2 = shufflevector <32 x i8> %1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 20, i32 21, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 22, i32 23, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
|
||||||
ret <32 x i8> %2
|
ret <32 x i8> %2
|
||||||
|
|
Loading…
Reference in New Issue